diff --git a/.typos.toml b/.typos.toml
index 195ba8ec..3dc13af8 100644
--- a/.typos.toml
+++ b/.typos.toml
@@ -3,6 +3,7 @@ extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analy
 [default.extend-identifiers]
 # HashiCorp = "HashiCorp"
+connexion = "connexion"
 
 [default.extend-words]
diff --git a/langchain-llamaindex-slackbot/.gitignore b/langchain-llamaindex-slackbot/.gitignore
index 6fc2f233..84ad0a94 100644
--- a/langchain-llamaindex-slackbot/.gitignore
+++ b/langchain-llamaindex-slackbot/.gitignore
@@ -129,7 +129,7 @@ dmypy.json
 .pyre/
 
 # Zenml
-.zen/
+src/.zen/
 
 # MLflow
 mlruns/
diff --git a/langchain-llamaindex-slackbot/src/local_testing_slackbot.py b/langchain-llamaindex-slackbot/src/local_testing_slackbot.py
index f492a5ff..3c1fc5f9 100644
--- a/langchain-llamaindex-slackbot/src/local_testing_slackbot.py
+++ b/langchain-llamaindex-slackbot/src/local_testing_slackbot.py
@@ -18,10 +18,14 @@
     get_vector_store,
 )
 from zenml.logger import get_logger
+from zenml.client import Client
+
+# Fetch the secret once and read all three values from it.
+langchain_secret = Client().get_secret("langchain_project_secret")
+SLACK_BOT_TOKEN = langchain_secret.secret_values["slack_bot_token"]
+SLACK_APP_TOKEN = langchain_secret.secret_values["slack_app_token"]
+OPENAI_API_KEY = langchain_secret.secret_values["openai_api_key"]
 
-SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN")
-SLACK_APP_TOKEN = os.getenv("SLACK_APP_TOKEN")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 PIPELINE_NAME = os.getenv("PIPELINE_NAME", "zenml_docs_index_generation")
 
 logger = get_logger(__name__)
@@ -77,7 +82,7 @@ def reply_in_thread(body: dict, say, context):
     thread_ts = event.get("thread_ts", None) or event["ts"]
 
     if context["bot_user_id"] in event["text"]:
-        logger.debug(f"Received message: {event['text']}")
+        logger.info(f"Received message: {event['text']}")
 
         if event.get("thread_ts", None):
             full_thread = [
                 f"{msg['text']}"
@@ -107,6 +112,7 @@ def reply_in_thread(body: dict, say, context):
             question=event["text"],
             verbose=True,
         )
+        logger.info(output)
 
         say(text=output, thread_ts=thread_ts)
diff --git a/langchain-llamaindex-slackbot/src/pipelines/index_builder.py b/langchain-llamaindex-slackbot/src/pipelines/index_builder.py
index f0275226..1335c3b1 100644
--- a/langchain-llamaindex-slackbot/src/pipelines/index_builder.py
+++ b/langchain-llamaindex-slackbot/src/pipelines/index_builder.py
@@ -11,17 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied. See the License for the specific language governing
 # permissions and limitations under the License.
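Aside: the `Client().get_secret(...)` lookups above assume a ZenML secret named `langchain_project_secret` already exists on the server. A minimal sketch of how it could be registered once via the ZenML CLI (the key names are taken from the code above; the values are placeholders):

```commandline
zenml secret create langchain_project_secret \
    --slack_bot_token=<SLACK_BOT_TOKEN> \
    --slack_app_token=<SLACK_APP_TOKEN> \
    --openai_api_key=<OPENAI_API_KEY>
```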
- +import os from steps.index_generator import index_generator from steps.url_scraper import url_scraper from steps.web_url_loader import web_url_loader from zenml import pipeline +from zenml.config import DockerSettings +from zenml.config.docker_settings import SourceFileMode pipeline_name = "zenml_docs_index_generation" +docker_settings = DockerSettings( + requirements=[ + "langchain==0.0.263", + "openai==0.27.2", + "slack-bolt==1.16.2", + "slack-sdk==3.20.0", + "fastapi", + "flask", + "uvicorn", + "gcsfs==2023.5.0", + "faiss-cpu==1.7.3", + "unstructured==0.5.7", + "tiktoken", + "bs4" + ], + source_files=SourceFileMode.DOWNLOAD +) - -@pipeline(name=pipeline_name) +@pipeline(name=pipeline_name, settings={"docker": docker_settings}) def docs_to_index_pipeline( docs_url: str = "", repo_url: str = "", diff --git a/langchain-llamaindex-slackbot/src/requirements-slackbot.txt b/langchain-llamaindex-slackbot/src/requirements-slackbot.txt index 62066bd2..24f3dffa 100644 --- a/langchain-llamaindex-slackbot/src/requirements-slackbot.txt +++ b/langchain-llamaindex-slackbot/src/requirements-slackbot.txt @@ -2,7 +2,7 @@ langchain==0.0.263 openai==0.27.2 slack-bolt==1.16.2 slack-sdk==3.20.0 -zenml[connectors-gcp]==0.45.3 +zenml[connectors-gcp]==0.45.5 fastapi flask uvicorn diff --git a/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt b/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt index d0ceb3dc..1fc6508c 100644 --- a/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt +++ b/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt @@ -2,7 +2,7 @@ langchain>=0.0.125,<=0.0.263 openai>=0.27.2,<=0.27.8 slack-bolt==1.16.2 slack-sdk==3.20.0 -zenml==0.44.1 +zenml==0.45.6 fastapi flask uvicorn @@ -11,3 +11,4 @@ faiss-cpu>=1.7.3,<=1.7.4 unstructured>=0.5.7,<=0.7.8 lanarky==0.7.12 tiktoken +bs4 \ No newline at end of file diff --git a/langchain-llamaindex-slackbot/src/steps/index_generator.py b/langchain-llamaindex-slackbot/src/steps/index_generator.py index 1b3d065d..7e57b888 100644 --- a/langchain-llamaindex-slackbot/src/steps/index_generator.py +++ b/langchain-llamaindex-slackbot/src/steps/index_generator.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. +import os from typing import List @@ -21,10 +22,12 @@ ) from langchain.vectorstores import FAISS, VectorStore from zenml import step +from zenml.client import Client @step(enable_cache=False) def index_generator(documents: List[Document]) -> VectorStore: + os.environ["OPENAI_API_KEY"] = Client().get_secret("langchain_project_secret").secret_values["openai_api_key"] embeddings = OpenAIEmbeddings() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) diff --git a/langchain-llamaindex-slackbot/src/steps/url_scraper.py b/langchain-llamaindex-slackbot/src/steps/url_scraper.py index fe376aee..cbd8b2be 100644 --- a/langchain-llamaindex-slackbot/src/steps/url_scraper.py +++ b/langchain-llamaindex-slackbot/src/steps/url_scraper.py @@ -16,6 +16,7 @@ from steps.url_scraping_utils import get_all_pages from zenml import step +from zenml.client import Client @step(enable_cache=True) @@ -36,5 +37,4 @@ def url_scraper( Returns: List of URLs to scrape. 
""" - # examples_readme_urls = get_nested_readme_urls(repo_url) return get_all_pages(docs_url) diff --git a/stack-showcase/.dockerignore b/stack-showcase/.dockerignore new file mode 100644 index 00000000..455f4d7a --- /dev/null +++ b/stack-showcase/.dockerignore @@ -0,0 +1,2 @@ +.venv* +.requirements* \ No newline at end of file diff --git a/stack-showcase/README.md b/stack-showcase/README.md new file mode 100644 index 00000000..a0423f92 --- /dev/null +++ b/stack-showcase/README.md @@ -0,0 +1,53 @@ +# 📜 ZenML Stack Show Case + +This project aims to demonstrate the power of stacks. The code in this +project assumes that you have quite a few stacks registered already. + +## default + * `default` Orchestrator + * `default` Artifact Store + +```commandline +zenml stack set default +python run.py --training-pipeline +``` + +## local-sagemaker-step-operator-stack + * `default` Orchestrator + * `s3` Artifact Store + * `local` Image Builder + * `aws` Container Registry + * `Sagemaker` Step Operator + +```commandline +zenml stack set local-sagemaker-step-operator-stack +zenml integration install aws -y +python run.py --training-pipeline +``` + +## sagemaker-airflow-stack + * `Airflow` Orchestrator + * `s3` Artifact Store + * `local` Image Builder + * `aws` Container Registry + * `Sagemaker` Step Operator + +```commandline +zenml stack set sagemaker-airflow-stack +zenml integration install airflow -y +pip install apache-airflow-providers-docker apache-airflow~=2.5.0 +zenml stack up +python run.py --training-pipeline +``` + +## sagemaker-stack + * `Sagemaker` Orchestrator + * `s3` Artifact Store + * `local` Image Builder + * `aws` Container Registry + * `Sagemaker` Step Operator + +```commandline +zenml stack set sagemaker-stack +python run.py --training-pipeline +``` diff --git a/stack-showcase/_assets/airflow_stack.png b/stack-showcase/_assets/airflow_stack.png new file mode 100644 index 00000000..f26a37d9 Binary files /dev/null and b/stack-showcase/_assets/airflow_stack.png differ diff --git a/stack-showcase/_assets/default_stack.png b/stack-showcase/_assets/default_stack.png new file mode 100644 index 00000000..91e768b0 Binary files /dev/null and b/stack-showcase/_assets/default_stack.png differ diff --git a/stack-showcase/_assets/local_sagmaker_so_stack.png b/stack-showcase/_assets/local_sagmaker_so_stack.png new file mode 100644 index 00000000..899e9b64 Binary files /dev/null and b/stack-showcase/_assets/local_sagmaker_so_stack.png differ diff --git a/stack-showcase/_assets/sagemaker_stack.png b/stack-showcase/_assets/sagemaker_stack.png new file mode 100644 index 00000000..f57482ec Binary files /dev/null and b/stack-showcase/_assets/sagemaker_stack.png differ diff --git a/stack-showcase/configs/feature_engineering.yaml b/stack-showcase/configs/feature_engineering.yaml new file mode 100644 index 00000000..daa91a1e --- /dev/null +++ b/stack-showcase/configs/feature_engineering.yaml @@ -0,0 +1,12 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + license: Apache 2.0 + description: Classification of Breast Cancer Dataset. 
+ tags: ["classification", "sklearn"] diff --git a/stack-showcase/configs/inference.yaml b/stack-showcase/configs/inference.yaml new file mode 100644 index 00000000..661b3286 --- /dev/null +++ b/stack-showcase/configs/inference.yaml @@ -0,0 +1,13 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + version: production + license: Apache 2.0 + description: Classification of Breast Cancer Dataset. + tags: ["classification", "sklearn"] diff --git a/stack-showcase/configs/training.yaml b/stack-showcase/configs/training.yaml new file mode 100644 index 00000000..daa91a1e --- /dev/null +++ b/stack-showcase/configs/training.yaml @@ -0,0 +1,12 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + +# configuration of the Model Control Plane +model_version: + name: breast_cancer_classifier + license: Apache 2.0 + description: Classification of Breast Cancer Dataset. + tags: ["classification", "sklearn"] diff --git a/stack-showcase/pipelines/__init__.py b/stack-showcase/pipelines/__init__.py new file mode 100644 index 00000000..16ae3630 --- /dev/null +++ b/stack-showcase/pipelines/__init__.py @@ -0,0 +1,5 @@ +# {% include 'template/license_header' %} + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training diff --git a/stack-showcase/pipelines/feature_engineering.py b/stack-showcase/pipelines/feature_engineering.py new file mode 100644 index 00000000..46f5e0cd --- /dev/null +++ b/stack-showcase/pipelines/feature_engineering.py @@ -0,0 +1,54 @@ +# {% include 'template/license_header' %} + +import random +from typing import List, Optional + +from steps import ( + data_loader, + data_preprocessor, + data_splitter, +) +from zenml import pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def feature_engineering( + test_size: float = 0.2, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", +): + """ + Feature engineering pipeline. + + This is a pipeline that loads the data, processes it and splits + it into train and test sets. + + Args: + test_size: Size of holdout set for training 0.0..1.0 + drop_na: If `True` NA values will be removed from dataset + normalize: If `True` dataset will be normalized with MinMaxScaler + drop_columns: List of columns to drop from dataset + target: Name of target column in dataset + """ + ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. 
+    raw_data = data_loader(random_state=random.randint(0, 100), target=target)
+    dataset_trn, dataset_tst = data_splitter(
+        dataset=raw_data,
+        test_size=test_size,
+    )
+    dataset_trn, dataset_tst, _ = data_preprocessor(
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        drop_na=drop_na,
+        normalize=normalize,
+        drop_columns=drop_columns,
+        target=target,
+    )
+    return dataset_trn, dataset_tst
diff --git a/stack-showcase/pipelines/inference.py b/stack-showcase/pipelines/inference.py
new file mode 100644
index 00000000..01dc3db5
--- /dev/null
+++ b/stack-showcase/pipelines/inference.py
@@ -0,0 +1,52 @@
+# {% include 'template/license_header' %}
+
+from typing import List, Optional
+
+from steps import (
+    data_loader,
+    inference_preprocessor,
+    inference_predict,
+)
+from zenml import pipeline, ExternalArtifact
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def inference(
+    test_size: float = 0.2,
+    drop_na: Optional[bool] = None,
+    normalize: Optional[bool] = None,
+    drop_columns: Optional[List[str]] = None,
+):
+    """
+    Model batch inference pipeline.
+
+    This is a pipeline that loads the inference data, preprocesses it
+    with the preprocessing pipeline fitted during training, and runs
+    predictions with the trained model.
+
+    Args:
+        test_size: Size of holdout set for training 0.0..1.0
+        drop_na: If `True` NA values will be removed from dataset
+        normalize: If `True` dataset will be normalized with MinMaxScaler
+        drop_columns: List of columns to drop from dataset
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+    client = Client()
+    random_state = client.get_artifact("dataset").run_metadata["random_state"].value
+    target = "target"
+    df_inference = data_loader(random_state=random_state, is_inference=True)
+    df_inference = inference_preprocessor(
+        dataset_inf=df_inference,
+        preprocess_pipeline=ExternalArtifact(name="preprocess_pipeline"),
+        target=target,
+    )
+    inference_predict(
+        dataset_inf=df_inference,
+    )
+    ### END CODE HERE ###
diff --git a/stack-showcase/pipelines/training.py b/stack-showcase/pipelines/training.py
new file mode 100644
index 00000000..a82760de
--- /dev/null
+++ b/stack-showcase/pipelines/training.py
@@ -0,0 +1,61 @@
+# {% include 'template/license_header' %}
+
+from typing import Optional
+from uuid import UUID
+
+from steps import model_evaluator, model_trainer, model_promoter
+from zenml import ExternalArtifact, pipeline
+from zenml.logger import get_logger
+
+from pipelines import (
+    feature_engineering,
+)
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def training(
+    train_dataset_id: Optional[UUID] = None,
+    test_dataset_id: Optional[UUID] = None,
+    min_train_accuracy: float = 0.0,
+    min_test_accuracy: float = 0.0,
+):
+    """
+    Model training pipeline.
+
+    This is a pipeline that loads the data, processes it and splits
+    it into train and test sets, then trains and evaluates a model
+    on them.
+
+    Args:
+        train_dataset_id: ID of a versioned train dataset artifact to reuse.
+            If not given, the feature engineering pipeline is run first.
+        test_dataset_id: ID of a versioned test dataset artifact to reuse.
+        min_train_accuracy: Minimum acceptable accuracy on the train set.
+        min_test_accuracy: Minimum acceptable accuracy on the test set.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+ + # Execute Feature Engineering Pipeline + if train_dataset_id is None or test_dataset_id is None: + dataset_trn, dataset_tst = feature_engineering() + else: + dataset_trn = ExternalArtifact(id=train_dataset_id) + dataset_tst = ExternalArtifact(id=test_dataset_id) + + model = model_trainer( + dataset_trn=dataset_trn, + ) + + acc = model_evaluator( + model=model, + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + min_train_accuracy=min_train_accuracy, + min_test_accuracy=min_test_accuracy, + ) + + model_promoter(accuracy=acc) + ### END CODE HERE ### diff --git a/stack-showcase/requirements.txt b/stack-showcase/requirements.txt new file mode 100644 index 00000000..f3a893aa --- /dev/null +++ b/stack-showcase/requirements.txt @@ -0,0 +1,3 @@ +zenml[server]>=0.50.0 +notebook +scikit-learn<1.3 \ No newline at end of file diff --git a/stack-showcase/run.ipynb b/stack-showcase/run.ipynb new file mode 100644 index 00000000..69063cec --- /dev/null +++ b/stack-showcase/run.ipynb @@ -0,0 +1,981 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "081d5616", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", + "\u001b[?25l\u001b[2;36mFound existing ZenML repository at path \u001b[0m\n", + "\u001b[2;32m'/home/apenner/PycharmProjects/template-starter/template'\u001b[0m\u001b[2;36m.\u001b[0m\n", + "\u001b[2;32m⠋\u001b[0m\u001b[2;36m Initializing ZenML repository at \u001b[0m\n", + "\u001b[2;36m/home/apenner/PycharmProjects/template-starter/template.\u001b[0m\n", + "\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[32m⠋\u001b[0m Initializing ZenML repository at \n", + "/home/apenner/PycharmProjects/template-starter/template.\n", + "\n", + "\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", + "\u001b[2K\u001b[2;36mActive repository stack set to: \u001b[0m\u001b[2;32m'default'\u001b[0m.\n", + "\u001b[2K\u001b[32m⠙\u001b[0m Setting the repository active stack to 'default'...t'...\u001b[0m\n", + "\u001b[1A\u001b[2K" + ] + } + ], + "source": [ + "!zenml init\n", + "!zenml stack set default" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "79f775f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n" + ] + } + ], + "source": [ + "# Do the imports at the top\n", + "\n", + "import random\n", + "from zenml import ExternalArtifact, pipeline \n", + "from zenml.client import Client\n", + "from zenml.logger import get_logger\n", + "from uuid import UUID\n", + "\n", + "import os\n", + "from typing import Optional, List\n", + "\n", + "from zenml import pipeline\n", + "\n", + "from steps import (\n", + " data_loader,\n", + " data_preprocessor,\n", + " data_splitter,\n", + " model_evaluator,\n", + " model_trainer,\n", + " inference_predict,\n", + " inference_preprocessor\n", + ")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "client = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b50a9537", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def feature_engineering(\n", + " test_size: float = 0.2,\n", + " drop_na: Optional[bool] = None,\n", + " normalize: Optional[bool] = None,\n", + " drop_columns: Optional[List[str]] = None,\n", + " target: Optional[str] = \"target\",\n", + "):\n", + " \"\"\"\n", + " Feature engineering pipeline.\n", + 
"\n", + " This is a pipeline that loads the data, processes it and splits\n", + " it into train and test sets.\n", + "\n", + " Args:\n", + " test_size: Size of holdout set for training 0.0..1.0\n", + " drop_na: If `True` NA values will be removed from dataset\n", + " normalize: If `True` dataset will be normalized with MinMaxScaler\n", + " drop_columns: List of columns to drop from dataset\n", + " target: Name of target column in dataset\n", + " \"\"\"\n", + " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + " raw_data = data_loader(random_state=random.randint(0, 100), target=target)\n", + " dataset_trn, dataset_tst = data_splitter(\n", + " dataset=raw_data,\n", + " test_size=test_size,\n", + " )\n", + " dataset_trn, dataset_tst, _ = data_preprocessor(\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " drop_na=drop_na,\n", + " normalize=normalize,\n", + " drop_columns=drop_columns,\n", + " target=target,\n", + " )\n", + " \n", + " return dataset_trn, dataset_tst" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bc5feef4-7016-420e-9af9-2e87ff666f74", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_args = {}\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"feature_engineering.yaml\")\n", + "fe_p_configured = feature_engineering.with_options(**pipeline_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "75cf3740-b2d8-4c4b-b91b-dc1637000880", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mfeature_engineering\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mReusing registered version: \u001b[0m\u001b[1;36m(version: 1)\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mNew model version \u001b[0m\u001b[1;36m34\u001b[1;35m was created.\u001b[0m\n", + "\u001b[1;35mExecuting a new run.\u001b[0m\n", + "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mDataset with 541 records loaded!\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has finished in \u001b[0m\u001b[1;36m6.777s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has finished in \u001b[0m\u001b[1;36m11.345s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m14.866s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mRun \u001b[0m\u001b[1;36mfeature_engineering-2023_12_06-09_08_46_821042\u001b[1;35m has finished in \u001b[0m\u001b[1;36m36.198s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/52874ade-f314-45ab-b9bf-e95fb29290b8/runs/9d9e49b1-d78f-478b-991e-da87b0560512/dag\u001b[0m\n" + ] + } + ], + 
"source": [ + "latest_run = fe_p_configured()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "69ade540", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def training(\n", + " train_dataset_id: Optional[UUID] = None,\n", + " test_dataset_id: Optional[UUID] = None,\n", + " min_train_accuracy: float = 0.0,\n", + " min_test_accuracy: float = 0.0,\n", + "):\n", + " \"\"\"\n", + " Model training pipeline.\n", + "\n", + " This is a pipeline that loads the data, processes it and splits\n", + " it into train and test sets, then search for best hyperparameters,\n", + " trains and evaluates a model.\n", + "\n", + " Args:\n", + " test_size: Size of holdout set for training 0.0..1.0\n", + " drop_na: If `True` NA values will be removed from dataset\n", + " normalize: If `True` dataset will be normalized with MinMaxScaler\n", + " drop_columns: List of columns to drop from dataset\n", + " \"\"\"\n", + " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + " \n", + " # Execute Feature Engineering Pipeline\n", + " if train_dataset_id is None or test_dataset_id is None:\n", + " dataset_trn, dataset_tst = feature_engineering()\n", + " else:\n", + " dataset_trn = ExternalArtifact(id=train_dataset_id)\n", + " dataset_tst = ExternalArtifact(id=test_dataset_id)\n", + " \n", + " model = model_trainer(\n", + " dataset_trn=dataset_trn,\n", + " )\n", + "\n", + " model_evaluator(\n", + " model=model,\n", + " dataset_trn=dataset_trn,\n", + " dataset_tst=dataset_tst,\n", + " min_train_accuracy=min_train_accuracy,\n", + " min_test_accuracy=min_test_accuracy,\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5b1f78df", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_args = {}\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"training.yaml\")\n", + "fe_t_configured = training.with_options(**pipeline_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "acf306a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mtraining\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mRegistered new version: \u001b[0m\u001b[1;36m(version 2)\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mNew model version \u001b[0m\u001b[1;36m35\u001b[1;35m was created.\u001b[0m\n", + "\u001b[1;35mExecuting a new run.\u001b[0m\n", + "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mDataset with 541 records loaded!\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has finished in \u001b[0m\u001b[1;36m7.368s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has finished in \u001b[0m\u001b[1;36m11.009s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep 
\u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m14.134s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mCaching \u001b[0m\u001b[1;36mdisabled\u001b[1;35m explicitly for \u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mTraining model DecisionTreeClassifier()...\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m has finished in \u001b[0m\u001b[1;36m7.035s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_evaluator\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mTrain accuracy=100.00%\u001b[0m\n", + "\u001b[1;35mTest accuracy=92.66%\u001b[0m\n", + "\u001b[1;35mImplicitly linking artifact \u001b[0m\u001b[1;36moutput\u001b[1;35m to model \u001b[0m\u001b[1;36mbreast_cancer_classifier\u001b[1;35m version \u001b[0m\u001b[1;36m35\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_evaluator\u001b[1;35m has finished in \u001b[0m\u001b[1;36m6.050s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mRun \u001b[0m\u001b[1;36mtraining-2023_12_06-09_09_41_413455\u001b[1;35m has finished in \u001b[0m\u001b[1;36m51.278s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/787c6360-4499-4e2e-8d50-edaaa3956a6f/runs/2a335b9c-bb8e-425c-80e2-0a6cc0ffe56a/dag\u001b[0m\n" + ] + } + ], + "source": [ + "fe_t_configured()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ad6aa280", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "\n", + "import pandas as pd\n", + "from typing_extensions import Annotated\n", + "\n", + "from zenml import get_step_context, step\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "\n", + "@step\n", + "def inference_predict(\n", + " dataset_inf: pd.DataFrame,\n", + ") -> Annotated[pd.Series, \"predictions\"]:\n", + " \"\"\"Predictions step.\n", + "\n", + " This is an example of a predictions step that takes the data in and returns\n", + " predicted values.\n", + "\n", + " This step is parameterized, which allows you to configure the step\n", + " independently of the step code, before running it in a pipeline.\n", + " In this example, the step can be configured to use different input data.\n", + " See the documentation for more information:\n", + "\n", + " https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines\n", + "\n", + " Args:\n", + " dataset_inf: The inference dataset.\n", + "\n", + " Returns:\n", + " The predictions as pandas series\n", + " \"\"\"\n", + " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", + " model_version = get_step_context().model_version\n", + "\n", + " print(model_version)\n", + "\n", + " # run prediction from memory\n", + " predictor = model_version.load_artifact(\"model\")\n", + " predictions = predictor.predict(dataset_inf)\n", + "\n", + " print(predictions)\n", + " predictions = pd.Series(predictions, name=\"predicted\")\n", + " ### YOUR CODE ENDS HERE ###\n", + "\n", + " return predictions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "517ad39d", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def batch_inference():\n", + " \"\"\"\n", + " Model batch inference pipeline.\n", + "\n", + " This is a pipeline that loads the inference data, processes\n", + " it, analyze for data drift and run 
inference.\n", + " \"\"\"\n", + " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + " ########## ETL stage ##########\n", + " random_state = client.get_artifact(\"dataset\").run_metadata[\"random_state\"].value\n", + " target = client.get_artifact(\"dataset_trn\").run_metadata['target'].value\n", + " df_inference = data_loader(\n", + " random_state=random_state, is_inference=True\n", + " )\n", + " df_inference = inference_preprocessor(\n", + " dataset_inf=df_inference,\n", + " preprocess_pipeline=ExternalArtifact(name=\"preprocess_pipeline\"),\n", + " target=target,\n", + " )\n", + " inference_predict(\n", + " dataset_inf=df_inference,\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f0d9ebb6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35m\u001b[0m\u001b[1;36mversion\u001b[1;35m \u001b[0m\u001b[1;36mproduction\u001b[1;35m matches one of the possible \u001b[0m\u001b[1;36mModelStages\u001b[1;35m and will be fetched using stage.\u001b[0m\n" + ] + } + ], + "source": [ + "pipeline_args = {}\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"inference.yaml\")\n", + "fe_b_configured = batch_inference.with_options(**pipeline_args)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9901c6d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.\u001b[0m\n", + "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mbatch_inference\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mReusing registered version: \u001b[0m\u001b[1;36m(version: 1)\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mExecuting a new run.\u001b[0m\n", + "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mUsing cached version of \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_preprocessor\u001b[1;35m has started.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m8.661s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_predict\u001b[1;35m has started.\u001b[0m\n", + "name='breast_cancer_classifier' license='Apache 2.0' description='Classification of Breast Cancer Dataset.' audience=None use_cases=None limitations=None trade_offs=None ethics=None tags=['classification', 'sklearn'] version='production' save_models_to_registry=True suppress_class_validation_warnings=True was_created_in_this_run=False\n", + "\u001b[33mYou specified both an ID as well as a version of the artifacts. Ignoring the version and fetching the artifacts by ID.\u001b[0m\n", + "\u001b[33mYour artifact was materialized under Python version 'unknown' but you are currently using '3.9.13'. 
This might cause unexpected behavior since pickle is not reproducible across Python versions. Attempting to load anyway...\u001b[0m\n", + "\u001b[33mCould not import Azure service connector: No module named 'azure.identity'.\u001b[0m\n", + "[1 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 1 1 1]\n", + "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_predict\u001b[1;35m has finished in \u001b[0m\u001b[1;36m18.218s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mRun \u001b[0m\u001b[1;36mbatch_inference-2023_12_06-09_11_29_924914\u001b[1;35m has finished in \u001b[0m\u001b[1;36m32.726s\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/2979acb2-c862-480a-8f50-a2be4c76a8a2/runs/7886e370-b05a-4205-931e-e4994fabd897/dag\u001b[0m\n" + ] + } + ], + "source": [ + "fe_b_configured()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98d39df8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "51690802-31a7-4e6d-9f88-e6457c6c4a96", + "metadata": {}, + "source": [ + "# Huggingface Model to Sagemaker Endpoint: Automating MLOps with ZenML\n", + "Deploying Huggingface models to AWS Sagemaker endpoints typically only requires a few lines of code. However, there's a growing demand to not just deploy, but to seamlessly automate the entire flow from training to production with comprehensive lineage tracking. ZenML adeptly fills this niche, providing an end-to-end MLOps solution for Huggingface users wishing to deploy to Sagemaker. Below, we’ll walk through the architecture that ZenML employs to bring a Huggingface model into production with AWS Sagemaker. Of course all of this can be adapted to not just Sagemaker, but any other model deployment service like GCP Vertex or Azure ML Platform.\n", + "\n", + "This blog post showcases one way of using ZenML pipelines to achieve this:\n", + "\n", + "- Create and version a dataset in a feature_engineering_pipeline.\n", + "- Train/Finetune a BERT-based Sentiment Analysis NLP model and push to Huggingface Hub in a training_pipeline.\n", + "- Promote this model to Production by comparing to previous models in a promotion_pipeline.\n", + "- Deploy the model at the Production Stage to a AWS Sagemaker endpoint with a deployment_pipeline.\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "500e3c24-b105-4a69-b2fc-e0ce1f1c1d46", + "metadata": {}, + "outputs": [], + "source": [ + "# Do the imports at the top\n", + "\n", + "import numpy as np\n", + "from datasets import DatasetDict, load_dataset\n", + "from typing_extensions import Annotated\n", + "from zenml import step\n", + "from zenml.logger import get_logger\n", + "\n", + "import os\n", + "from typing import Optional\n", + "from datetime import datetime as dt\n", + "\n", + "from zenml import pipeline\n", + "from zenml.model import ModelConfig\n", + "\n", + "from steps import (\n", + " data_loader,\n", + " notify_on_failure,\n", + " tokenization_step,\n", + " tokenizer_loader,\n", + " generate_reference_and_comparison_datasets,\n", + ")\n", + "from zenml.integrations.evidently.metrics import EvidentlyMetricConfig\n", + "from zenml.integrations.evidently.steps import (\n", + " EvidentlyColumnMapping,\n", + " evidently_report_step,\n", + ")\n", + "\n", + "from pipelines import (\n", + " sentinment_analysis_deploy_pipeline,\n", + " sentinment_analysis_promote_pipeline,\n", + " sentinment_analysis_training_pipeline,\n", + ")\n", + "\n", + 
"logger = get_logger(__name__)" + ] + }, + { + "cell_type": "markdown", + "id": "fc77b660-e206-46b1-a924-407e797a8f47", + "metadata": {}, + "source": [ + "# 🍳Breaking it down\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "31edaf46-6981-42be-99b7-9bdd91c160d5", + "metadata": {}, + "source": [ + "## 👶 Step 1: Start with feature engineering\n", + "\n", + "Automated feature engineering forms the foundation of this MLOps workflow. Thats why the first pipeline is the feature engineering pipeline. This pipeline loads some data from Huggingface and uses a base tokenizer to create a tokenized dataset. The data loader step is a simple Python function that returns a Huggingface dataloader object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35de0e4c-b6f8-4b68-927a-f40e4130dc93", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def data_loader() -> Annotated[DatasetDict, \"dataset\"]:\n", + " logger.info(f\"Loading dataset airline_reviews... \")\n", + " hf_dataset = load_dataset(\"Shayanvsf/US_Airline_Sentiment\")\n", + " hf_dataset = hf_dataset.rename_column(\"airline_sentiment\", \"label\")\n", + " hf_dataset = hf_dataset.remove_columns(\n", + " [\"airline_sentiment_confidence\", \"negativereason_confidence\"]\n", + " )\n", + " return hf_dataset" + ] + }, + { + "cell_type": "markdown", + "id": "49e4462c-1e64-48d3-bae7-76696a958646", + "metadata": {}, + "source": [ + "Notice that you can give each dataset a name with Python’s Annotated object. The DatasetDict is a native Huggingface dataset which ZenML knows how to persist through steps. This flow ensures reproducibility and version control for every dataset iteration.\n", + "\n", + "Also notice this is a simple Python function, that can be called with the `entrypoint` wrapper:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18144a6b-c266-453d-82c8-b5d6aa1be0aa", + "metadata": {}, + "outputs": [], + "source": [ + "hf_dataset = data_loader.entrypoint()\n", + "print(hf_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "31330d3c-044f-4912-8d36-74146f48cecf", + "metadata": {}, + "source": [ + "Now we put this a full feature engineering pipeline. Each run of the feature engineering pipeline produces a new dataset to use for the training pipeline. 
ZenML versions this data as it flows through the pipeline.\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "9511bd84-1e97-42db-9b75-06285cc6904c", + "metadata": {}, + "source": [ + "### Set your stack" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76f3a7e7-0d85-43b3-9e9f-4c7f20ea65e6", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack describe hf-sagemaker-local" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04b0bf69-70c6-4408-b18c-95df9e030c0c", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack set hf-sagemaker-local" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5398a4-a9ec-42d6-bbd6-390244c52d13", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack get" + ] + }, + { + "cell_type": "markdown", + "id": "152f718d-70c2-4a29-a73e-37db85675cb8", + "metadata": {}, + "source": [ + "### Run the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ca6c41e-e4b3-46d2-8264-9a453ac9aa3c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "@pipeline(on_failure=notify_on_failure)\n", + "def sentinment_analysis_feature_engineering_pipeline(\n", + " lower_case: Optional[bool] = True,\n", + " padding: Optional[str] = \"max_length\",\n", + " max_seq_length: Optional[int] = 128,\n", + " text_column: Optional[str] = \"text\",\n", + " label_column: Optional[str] = \"label\",\n", + "):\n", + " # Link all the steps together by calling them and passing the output\n", + " # of one step as the input of the next step.\n", + "\n", + " ########## Load Dataset stage ##########\n", + " dataset = data_loader()\n", + "\n", + " ########## Data Quality stage ##########\n", + " reference_dataset, comparison_dataset = generate_reference_and_comparison_datasets(\n", + " dataset\n", + " )\n", + " text_data_report = evidently_report_step.with_options(\n", + " parameters=dict(\n", + " column_mapping=EvidentlyColumnMapping(\n", + " target=\"label\",\n", + " text_features=[\"text\"],\n", + " ),\n", + " metrics=[\n", + " EvidentlyMetricConfig.metric(\"DataQualityPreset\"),\n", + " EvidentlyMetricConfig.metric(\n", + " \"TextOverviewPreset\", column_name=\"text\"\n", + " ),\n", + " ],\n", + " # We need to download the NLTK data for the TextOverviewPreset\n", + " download_nltk_data=True,\n", + " ),\n", + " )\n", + " text_data_report(reference_dataset, comparison_dataset)\n", + "\n", + " ########## Tokenization stage ##########\n", + " tokenizer = tokenizer_loader(lower_case=lower_case)\n", + " tokenized_data = tokenization_step(\n", + " dataset=dataset,\n", + " tokenizer=tokenizer,\n", + " padding=padding,\n", + " max_seq_length=max_seq_length,\n", + " text_column=text_column,\n", + " label_column=label_column,\n", + " )\n", + " return tokenizer, tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c8a5be7-ebaa-41c4-ac23-4afc6e7e06aa", + "metadata": {}, + "outputs": [], + "source": [ + "# Run a pipeline with the required parameters. 
\n", + "no_cache: bool = True\n", + "zenml_model_name: str = \"distil_bert_sentiment_analysis\"\n", + "max_seq_length = 512\n", + "\n", + "# This executes all steps in the pipeline in the correct order using the orchestrator\n", + "# stack component that is configured in your active ZenML stack.\n", + "model_config = ModelConfig(\n", + " name=zenml_model_name,\n", + " license=\"Apache 2.0\",\n", + " description=\"Show case Model Control Plane.\",\n", + " create_new_model_version=True,\n", + " delete_new_version_on_failure=True,\n", + " tags=[\"sentiment_analysis\", \"huggingface\"],\n", + ")\n", + "\n", + "pipeline_args = {}\n", + "\n", + "if no_cache:\n", + " pipeline_args[\"enable_cache\"] = False\n", + "\n", + "# Execute Feature Engineering Pipeline\n", + "pipeline_args[\"model_config\"] = model_config\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"feature_engineering_config.yaml\")\n", + "run_args_feature = {\n", + " \"max_seq_length\": max_seq_length,\n", + "}\n", + "pipeline_args[\n", + " \"run_name\"\n", + "] = f\"sentinment_analysis_feature_engineering_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\"\n", + "p = sentinment_analysis_feature_engineering_pipeline.with_options(**pipeline_args)\n", + "p(**run_args_feature)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e7c1ea2-64fe-478a-9963-17c7b7f62110", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.client import Client\n", + "from IPython.display import display, HTML\n", + "\n", + "client = Client()\n", + "# CHANGE THIS TO THE LATEST RUN ID\n", + "latest_run = client.get_pipeline_run(\"sentinment_analysis_feature_engineering_pipeline_run_2023_11_21_10_55_56\")\n", + "html = latest_run.steps[\"evidently_report_step\"].outputs['report_html'].load()\n", + "display(HTML(html))" + ] + }, + { + "cell_type": "markdown", + "id": "78ab8771-4421-4975-a3d5-12892a56b805", + "metadata": {}, + "source": [ + "## 💪 Step 2: Train the model with Huggingface Hub as the model registry\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "2843efa8-32b6-4b13-ac85-33c99cc94e3e", + "metadata": {}, + "source": [ + "Once the feature engineering pipeline has run a few times, we have many datasets to choose from. We can feed our desired one into a function that trains the model on the data. Thanks to the ZenML Huggingface integration, this data is loaded directly from the ZenML artifact store.\n", + "\n", + "\"Pipelines\n", + "\n", + "On the left side, we see our local MLOps stack, which defines our infrastructure and tooling we are using for this particular pipeline. ZenML makes it easy to run on a local stack on your development machine, or switch out the stack to run on a AWS Kubeflow-based stack (if you want to scale up).\n", + "\n", + "On the right side is the new kid on the block - the ZenML Model Control Plane. The Model Control Plane is a new feature in ZenML that allows users to have a complete overview of their machine learning models. 
It allows teams to consolidate all artifacts related to their ML models into one place, and manage their lifecycle easily, as you can see in this view from the ZenML Cloud:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c99b20f-8e3b-4119-86e9-33dd1395470a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"trainer_config.yaml\")\n",
+    "\n",
+    "pipeline_args[\"enable_cache\"] = True\n",
+    "\n",
+    "run_args_train = {\n",
+    "    \"num_epochs\": 1,\n",
+    "    \"train_batch_size\": 64,\n",
+    "    \"eval_batch_size\": 64,\n",
+    "    \"learning_rate\": 2e-4,\n",
+    "    \"weight_decay\": 0.01,\n",
+    "    \"max_seq_length\": 512,\n",
+    "}\n",
+    "\n",
+    "# Use versioned artifacts from the last step\n",
+    "# run_args_train[\"dataset_artifact_id\"] = latest_run.steps['tokenization_step'].output.id\n",
+    "# run_args_train[\"tokenizer_artifact_id\"] = latest_run.steps['tokenizer_loader'].output.id\n",
+    "\n",
+    "# Configure the model\n",
+    "pipeline_args[\"model_config\"] = model_config\n",
+    "\n",
+    "pipeline_args[\n",
+    "    \"run_name\"\n",
+    "] = f\"sentinment_analysis_training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96592299-0090-4d2a-962e-6ca232c1fb75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentinment_analysis_training_pipeline.with_options(**pipeline_args)(\n",
+    "    **run_args_train\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e24e29de-6d1b-41da-9ab2-ca2b32f1f540",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Check out a new stack\n",
+    "!zenml stack describe hf-sagemaker-airflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c9a5bee-8465-4d41-888a-093f1f6a2ef1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Change the stack\n",
+    "!zenml stack set hf-sagemaker-airflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3772c50-1c90-4ffc-8394-c9cfca16cc53",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentinment_analysis_training_pipeline.with_options(**pipeline_args)(\n",
+    "    **run_args_train\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be79f454-a45d-4f5f-aa93-330d52069124",
+   "metadata": {},
+   "source": [
+    "## 🫅 Step 3: Promote the model to production\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a09b432-7a66-473e-bdb6-ffdca730498b",
+   "metadata": {},
+   "source": [
+    "Following training, the automated promotion pipeline evaluates models against predefined metrics, identifying and marking the most performant one as 'Production ready'.
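\n",
+    "\n",
+    "The promotion step itself is not shown in this notebook; a minimal sketch of what it might look like, assuming the `set_stage` API on the step context's model version (the 0.8 accuracy threshold is illustrative):\n",
+    "\n",
+    "```python\n",
+    "from zenml import get_step_context, step\n",
+    "\n",
+    "@step\n",
+    "def model_promoter(accuracy: float, stage: str = \"production\") -> bool:\n",
+    "    \"\"\"Promote the current model version to `stage` if accuracy is acceptable.\"\"\"\n",
+    "    is_promoted = accuracy > 0.8\n",
+    "    if is_promoted:\n",
+    "        # The model version is attached to the context by the pipeline's model config.\n",
+    "        get_step_context().model_version.set_stage(stage, force=True)\n",
+    "    return is_promoted\n",
+    "```\n",
+    "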
This is another common use case for the Model Control Plane; we store the relevant metrics there to access them easily later.\n",
+    "\n",
+    "\"Pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5bac7ae5-70d0-449c-929c-e175c3062f2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!zenml stack set hf-sagemaker-local"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "170c9ef6-4e6f-4e50-ac37-e05bef8570ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_args_promoting = {}\n",
+    "model_config = ModelConfig(name=zenml_model_name)\n",
+    "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"promoting_config.yaml\")\n",
+    "\n",
+    "pipeline_args[\"model_config\"] = model_config\n",
+    "\n",
+    "pipeline_args[\n",
+    "    \"run_name\"\n",
+    "] = f\"sentinment_analysis_promoting_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6df11e2-4591-4186-a8f8-243f9c4d1e3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentinment_analysis_promote_pipeline.with_options(**pipeline_args)(\n",
+    "    **run_args_promoting\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6efc4968-35fd-42e3-ba62-d8e1557aa0d6",
+   "metadata": {},
+   "source": [
+    "## 💯 Step 4: Deploy the model to AWS Sagemaker Endpoints\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "577aff86-bde9-48d4-9b52-209cfed9fd4e",
+   "metadata": {},
+   "source": [
+    "This is the final step to automate the deployment of the slated production model to a Sagemaker endpoint. The deployment pipeline handles the complexities of AWS interactions and ensures that the model, along with its full history and context, is transitioned into a live environment ready for use. Here again we use the Model Control Plane interface to query the Huggingface revision and use that information to push to Huggingface Hub.\n",
+    "\n",
+    "\"Pipelines\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1513ab5f-de05-4344-9d2c-fedbfbd21ef0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!zenml stack set hf-sagemaker-local"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "606fdb3c-4eca-4d32-bccb-280743d15528",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from zenml.enums import ModelStages\n",
+    "\n",
+    "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"deploying_config.yaml\")\n",
+    "\n",
+    "# Deploying pipeline has new ZenML model config\n",
+    "model_config = ModelConfig(\n",
+    "    name=zenml_model_name,\n",
+    "    version=ModelStages.PRODUCTION,\n",
+    ")\n",
+    "pipeline_args[\"model_config\"] = model_config\n",
+    "pipeline_args[\"enable_cache\"] = False\n",
+    "run_args_deploying = {}\n",
+    "pipeline_args[\n",
+    "    \"run_name\"\n",
+    "] = f\"sentinment_analysis_deploy_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87f1f982-ab96-4207-8e7e-e318473587e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentinment_analysis_deploy_pipeline.with_options(**pipeline_args)(\n",
+    "    **run_args_deploying\n",
+    ")"
+   ]
+  },
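+  {
+   "cell_type": "markdown",
+   "id": "0d3e1f5a-deploy-sketch",
+   "metadata": {},
+   "source": [
+    "For intuition, the deployment step boils down to something like the plain `sagemaker` SDK call below. This is a sketch rather than the repo's actual step: the Hub model ID, IAM role, and framework versions are placeholders, and the real pipeline resolves the model revision from the Model Control Plane instead of hardcoding it.\n",
+    "\n",
+    "```python\n",
+    "from sagemaker.huggingface import HuggingFaceModel\n",
+    "\n",
+    "model = HuggingFaceModel(\n",
+    "    env={\n",
+    "        \"HF_MODEL_ID\": \"<hf-user>/distil_bert_sentiment_analysis\",\n",
+    "        \"HF_TASK\": \"text-classification\",\n",
+    "    },\n",
+    "    role=\"<sagemaker-execution-role-arn>\",\n",
+    "    transformers_version=\"4.26\",\n",
+    "    pytorch_version=\"1.13\",\n",
+    "    py_version=\"py39\",\n",
+    ")\n",
+    "predictor = model.deploy(initial_instance_count=1, instance_type=\"ml.m5.xlarge\")\n",
+    "print(predictor.predict({\"inputs\": \"The flight was great, staff was friendly!\"}))\n",
+    "```"
+   ]
+  },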
+  {
+   "cell_type": "markdown",
+   "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
+   "metadata": {},
+   "source": [
+    "ZenML builds upon the straightforward deployment capability of Huggingface models to AWS Sagemaker, and transforms it into a sophisticated, repeatable, and transparent MLOps workflow. It takes charge of the intricate steps necessary for modern ML systems, ensuring that software engineering leads can focus on iteration and innovation rather than operational intricacies.\n",
+    "\n",
+    "To delve deeper into each stage, refer to the comprehensive guide on GitHub: [zenml-io/zenml-huggingface-sagemaker](https://github.com/zenml-io/zenml-huggingface-sagemaker). Additionally, [this YouTube playlist](https://www.youtube.com/watch?v=Q1EH2H8Akgo&list=PLhNrLW_IWplw6dBbmGcL828-atJMu3CwF) provides a detailed visual walkthrough of the entire pipeline: Huggingface to Sagemaker ZenML tutorial.\n",
+    "\n",
+    "Interested in standardizing your MLOps workflows? ZenML Cloud is now available to all - get a managed ZenML server with important features such as RBAC and pipeline triggers. [Book a demo](https://zenml.io/book-a-demo) with us now to learn how you can create your own MLOps pipelines today."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/stack-showcase/run.py b/stack-showcase/run.py
new file mode 100644
index 00000000..f5c555f3
--- /dev/null
+++ b/stack-showcase/run.py
@@ -0,0 +1,159 @@
+# {% include 'templates/license_header' %}
+
+import os
+from typing import Optional
+
+import click
+from pipelines import (
+    feature_engineering,
+    inference,
+    training,
+)
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@click.command(
+    help="""
+ZenML Starter project CLI v0.0.1.
+
+Run the ZenML starter project with basic options.
+
+Examples:
+
+  \b
+  # Run the feature engineering pipeline
+  python run.py --feature-pipeline
+
+  \b
+  # Run the training pipeline
+  python run.py --training-pipeline
+
+  \b
+  # Run the training pipeline with versioned artifacts
+  python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1
+
+  \b
+  # Run the inference pipeline
+  python run.py --inference-pipeline
+
+"""
+)
+@click.option(
+    "--train-dataset-name",
+    default="dataset_trn",
+    type=click.STRING,
+    help="The name of the train dataset produced by feature engineering.",
+)
+@click.option(
+    "--train-dataset-version-name",
+    default=None,
+    type=click.STRING,
+    help="Version of the train dataset produced by feature engineering. "
+    "If not specified, a new version will be created.",
+)
+@click.option(
+    "--test-dataset-name",
+    default="dataset_tst",
+    type=click.STRING,
+    help="The name of the test dataset produced by feature engineering.
" + "If not specified, a new version will be created.", +) +@click.option( + "--feature-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that creates the dataset.", +) +@click.option( + "--training-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that trains the model.", +) +@click.option( + "--inference-pipeline", + is_flag=True, + default=False, + help="Whether to run the pipeline that performs inference.", +) +def main( + train_dataset_name: str = "dataset_trn", + train_dataset_version_name: Optional[str] = None, + test_dataset_name: str = "dataset_tst", + test_dataset_version_name: Optional[str] = None, + feature_pipeline: bool = False, + training_pipeline: bool = False, + inference_pipeline: bool = False, +): + """Main entry point for the pipeline execution. + + This entrypoint is where everything comes together: + + * configuring pipeline with the required parameters + (some of which may come from command line arguments, but most + of which comes from the YAML config files) + * launching the pipeline + """ + config_folder = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "configs", + ) + + # Execute Feature Engineering Pipeline + if feature_pipeline: + pipeline_args = {} + pipeline_args["config_path"] = os.path.join( + config_folder, "feature_engineering.yaml" + ) + run_args_feature = {} + feature_engineering.with_options(**pipeline_args)(**run_args_feature) + logger.info("Feature Engineering pipeline finished successfully!") + + # Execute Training Pipeline + if training_pipeline: + pipeline_args = {} + pipeline_args["config_path"] = os.path.join(config_folder, "training.yaml") + + run_args_train = {} + + # If train_dataset_version_name is specified, use versioned artifacts + if train_dataset_version_name or test_dataset_version_name: + # However, both train and test dataset versions must be specified + assert ( + train_dataset_version_name is not None + and test_dataset_version_name is not None + ) + client = Client() + train_dataset_artifact = client.get_artifact( + train_dataset_name, train_dataset_version_name + ) + # If train dataset is specified, test dataset must be specified + test_dataset_artifact = client.get_artifact( + test_dataset_name, test_dataset_version_name + ) + # Use versioned artifacts + run_args_train["train_dataset_id"] = train_dataset_artifact.id + run_args_train["test_dataset_id"] = test_dataset_artifact.id + + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline finished successfully!") + + if inference_pipeline: + pipeline_args = {} + pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml") + run_args_inference = {} + inference.with_options(**pipeline_args)(**run_args_inference) + logger.info("Inference pipeline finished successfully!") + + +if __name__ == "__main__": + main() diff --git a/stack-showcase/run_stack_showcase.ipynb b/stack-showcase/run_stack_showcase.ipynb new file mode 100644 index 00000000..769f2109 --- /dev/null +++ b/stack-showcase/run_stack_showcase.ipynb @@ -0,0 +1,827 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c53367f1-3951-48c7-9540-21daf818fa5d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n" + ] + } + ], + "source": [ + "# Do the imports at the top\n", + "\n", + "import random\n", + "from zenml import ExternalArtifact, pipeline \n", + "from zenml.client import 
Client\n", + "from zenml.logger import get_logger\n", + "from uuid import UUID\n", + "\n", + "import os\n", + "from typing import Optional, List\n", + "\n", + "from zenml import pipeline\n", + "\n", + "from pipelines import feature_engineering\n", + "\n", + "from steps import (\n", + " data_loader,\n", + " data_preprocessor,\n", + " data_splitter,\n", + " model_evaluator,\n", + " model_trainer,\n", + " inference_predict,\n", + " inference_preprocessor\n", + ")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "ab87746e-b804-4fab-88f6-d4967048cb45", + "metadata": {}, + "source": [ + "# Start local with a simple training pipeline\n", + "\n", + "First, lets run our training pipeline locally" + ] + }, + { + "cell_type": "markdown", + "id": "33872b19-7329-4f5e-9a1e-cfc1fe9d560d", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "\"Drawing\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b567a1d3-f625-4b98-9852-fcc3f3fe9609", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", + "\u001b[?25l\u001b[2;36mFound existing ZenML repository at path \u001b[0m\n", + "\u001b[2;32m'/home/apenner/PycharmProjects/zenml-projects/stack-showcase'\u001b[0m\u001b[2;36m.\u001b[0m\n", + "\u001b[2;32m⠋\u001b[0m\u001b[2;36m Initializing ZenML repository at \u001b[0m\n", + "\u001b[2;36m/home/apenner/PycharmProjects/zenml-projects/stack-showcase.\u001b[0m\n", + "\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[32m⠋\u001b[0m Initializing ZenML repository at \n", + "/home/apenner/PycharmProjects/zenml-projects/stack-showcase.\n", + "\n", + "\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", + "\u001b[2K\u001b[2;36mActive repository stack set to: \u001b[0m\u001b[2;32m'default'\u001b[0m.\n", + "\u001b[2K\u001b[32m⠙\u001b[0m Setting the repository active stack to 'default'...t'...\u001b[0m\n", + "\u001b[1A\u001b[2K" + ] + } + ], + "source": [ + "# To start with, we use the default stack\n", + "!zenml init\n", + "!zenml stack set default\n", + "\n", + "# We also need to connect to a remote ZenML Instance\n", + "# !zenml connect --url ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06625571-b281-4820-a7eb-3a085ba2e572", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.datasets import load_breast_cancer\n", + "from zenml import step\n", + "from zenml.logger import get_logger\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "\n", + "@step\n", + "def data_loader() -> pd.DataFrame:\n", + "    \"\"\"Dataset reader step.\"\"\"\n", + "    dataset = load_breast_cancer(as_frame=True)\n", + "    dataset: pd.DataFrame = dataset.frame\n", + "    dataset.reset_index(drop=True, inplace=True)\n", + "    logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n", + "    return dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "754a3069-9d13-4869-be64-a641071800cc", + "metadata": {}, + "outputs": [], + "source": [ + "data_loader()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8aa300f1-48df-4e62-87eb-0e2fc5735da8", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.config import DockerSettings\n", + "\n", + "docker_settings = DockerSettings(\n", + "    requirements=[\n", + "        \"pyarrow\",\n", + "    ],\n", + ")\n", + "\n", + "@pipeline(settings={\"docker\": docker_settings})\n", + "def training(\n", + "    train_dataset_id: Optional[UUID] = None,\n", + "    test_dataset_id: Optional[UUID] = None,\n", + "    min_train_accuracy: float = 0.0,\n", + "    min_test_accuracy: float = 0.0,\n", + "):\n", + "    \"\"\"Model training pipeline.\"\"\"\n", + "    # Execute Feature Engineering Pipeline\n", + "    dataset_trn, dataset_tst = feature_engineering()\n", + "\n", + "    model = model_trainer(\n", + "        dataset_trn=dataset_trn,\n", + "    )\n", + "\n", + "    model_evaluator(\n", + "        model=model,\n", + "        dataset_trn=dataset_trn,\n", + "        dataset_tst=dataset_tst,\n", + "        min_train_accuracy=min_train_accuracy,\n", + "        min_test_accuracy=min_test_accuracy,\n", + "    )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d55342bf-33c5-4646-b1ce-e599a99cf568", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_args = {\"enable_cache\": False}\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"training.yaml\")\n", + "fe_t_configured = training.with_options(**pipeline_args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5f4aed8-7d87-4e07-a25c-345d327ad636", + "metadata": {}, + "outputs": [], + "source": [ + "fe_t_configured()" + ] + }, + { + "cell_type": "markdown", + "id": "c3e6dc42-21b8-4b3c-90ec-d6e6d541907f", + "metadata": {}, + "source": [ + "# Let's outsource some compute to SageMaker!\n",
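+ "\n", + "A ZenML step operator lets an individual step (here the model trainer) run on remote compute such as SageMaker, while the rest of the pipeline keeps running locally."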
+ ] + }, + { + "cell_type": "markdown", + "id": "14a840b1-288d-4713-98f4-bbe8d6e06140", + "metadata": {}, + "source": [ + "Let's offload the training step to AWS as a SageMaker training job with a defined instance type, i.e. a set number of CPUs and amount of memory." + ] + }, + { + "cell_type": "markdown", + "id": "fa9308fb-3556-472c-8fc7-7f2f88d1c455", + "metadata": {}, + "source": [ + "\"Drawing\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "48be8f60-9fbe-4d19-92e4-d9cd8289dbf7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NumExpr defaulting to 8 threads.\n", + "Installing integrations...\n", + "... (pip 'Requirement already satisfied' output for sagemaker, kubernetes, aws-profile-manager, s3fs and boto3 truncated) ...\n", + "Installing collected packages: argparse, attrs\n", + "  Attempting uninstall: attrs\n", + "    Found existing installation: attrs 23.1.0\n", + "    Successfully uninstalled attrs-23.1.0\n", + "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavior is the source of the following dependency conflicts.\n", + "cattrs 23.2.3 requires attrs>=23.1.0, but you have attrs 22.2.0 which is incompatible.\n", + "Successfully installed argparse-1.4.0 attrs-22.2.0\n", + "Active repository stack set to: 'local-sagemaker-step-operator-stack'\n" + ] + } + ], + "source": [ + "# This pip installs the requirements locally\n", + "!zenml integration install aws s3 -y\n", + "\n", + "# This changes the active stack\n", + "!zenml stack set local-sagemaker-step-operator-stack" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5683a1c9-f5c1-4ba1-ad7c-1e427fd265df", + "metadata": {}, + "outputs": [], + "source": [ + "step_args = {}\n", + "step_args[\"step_operator\"] = \"sagemaker-eu\"\n", + "\n", + "# M5 Large is what we need for this big data!\n", + "step_args[\"settings\"] = {\"step_operator.sagemaker\": {\"estimator_args\": {\"instance_type\": \"ml.m5.large\"}}}\n",
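+ "\n", + "# (Comment added for clarity) Settings under \"step_operator.sagemaker\" are\n", + "# specific to the SageMaker step operator; estimator_args are forwarded to the\n", + "# underlying SageMaker Estimator, so instance_type selects the machine size.\n",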
+ "# Update the step. We can also do this in YAML\n", + "model_trainer = model_trainer.with_options(**step_args)\n", + "\n", + "pipeline_args = {\"enable_cache\": False}\n", + "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"training.yaml\")\n", + "fe_t_configured = training.with_options(**pipeline_args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85179f52-68f0-4c8d-9808-6b080bec72c3", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "fe_t_configured()" + ] + }, + { + "cell_type": "markdown", + "id": "0f40bd2e-14fb-4989-9545-a577a3be479a", + "metadata": {}, + "source": [ + "# Let's run the entire pipeline on Airflow now\n", + "\n", + "\"Drawing\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "157dd948-6a55-466e-b711-c919eed7cd91", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NumExpr defaulting to 8 threads.\n", + "Active repository stack set to: 'sagemaker-airflow-stack'.\n", + "Installing integrations...Collecting apache-airflow~=2.4.0\n", + "  Using cached apache_airflow-2.4.3-py3-none-any.whl (6.5 MB)\n", + "... (pip 'Requirement already satisfied' output truncated) ...\n", + "Installing collected packages: attrs, apache-airflow\n", + "  Attempting uninstall: attrs\n", + "    Found existing installation: attrs 22.2.0\n", + "    Successfully uninstalled attrs-22.2.0\n", + "  Attempting uninstall: apache-airflow\n", + "    Found existing installation: apache-airflow 2.5.3\n", + "    Successfully uninstalled apache-airflow-2.5.3\n", + "ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavior is the source of the following dependency conflicts.\n", + "sagemaker 2.117.0 requires attrs<23,>=20.3.0, but you have attrs 23.1.0 which is incompatible.\n", + "apache-airflow-providers-docker 3.8.2 requires apache-airflow>=2.5.0, but you have apache-airflow 2.4.3 which is incompatible.\n", + "Successfully installed apache-airflow-2.4.3 attrs-23.1.0\n", + "Requirement already satisfied: apache-airflow-providers-docker in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (3.8.2)\n", + "Collecting apache-airflow~=2.5.0\n", + "  Using cached apache_airflow-2.5.3-py3-none-any.whl (11.6 MB)\n", + "... (pip 'Requirement already satisfied' output truncated) ...\n", + "Requirement already satisfied: idna>=2.0.0 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from 
email-validator<2,>=1.0.5->flask-appbuilder==4.1.4->apache-airflow~=2.5.0) (3.4)\n", + "Requirement already satisfied: dnspython>=1.15.0 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from email-validator<2,>=1.0.5->flask-appbuilder==4.1.4->apache-airflow~=2.5.0) (2.4.2)\n", + "Requirement already satisfied: Babel>=2.3 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from Flask-Babel<3,>=1->flask-appbuilder==4.1.4->apache-airflow~=2.5.0) (2.13.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from httpcore<0.17.0,>=0.15.0->httpx->apache-airflow~=2.5.0) (0.14.0)\n", + "Requirement already satisfied: anyio<5.0,>=3.0 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from httpcore<0.17.0,>=0.15.0->httpx->apache-airflow~=2.5.0) (4.0.0)\n", + "Requirement already satisfied: zipp>=0.5 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from importlib-metadata>=3.6.0->flask<2.3,>=2.2->apache-airflow~=2.5.0) (3.17.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from requests<3,>=2.9.1->connexion[flask]>=2.10.0->apache-airflow~=2.5.0) (3.3.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from aiohttp->apache-airflow-providers-http->apache-airflow~=2.5.0) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from aiohttp->apache-airflow-providers-http->apache-airflow~=2.5.0) (4.0.3)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from aiohttp->apache-airflow-providers-http->apache-airflow~=2.5.0) (6.0.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from aiohttp->apache-airflow-providers-http->apache-airflow~=2.5.0) (1.4.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /home/apenner/.pyenv/versions/3.9.13/envs/demo/lib/python3.9/site-packages (from aiohttp->apache-airflow-providers-http->apache-airflow~=2.5.0) (1.9.2)\n", + "Installing collected packages: apache-airflow\n", + " Attempting uninstall: apache-airflow\n", + " Found existing installation: apache-airflow 2.4.3\n", + " Uninstalling apache-airflow-2.4.3:\n", + " Successfully uninstalled apache-airflow-2.4.3\n", + "Successfully installed apache-airflow-2.5.3\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.3.1 is available.\n", + "You should consider upgrading via the '/home/apenner/.pyenv/versions/3.9.13/envs/demo/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", + "\u001b[2;36mProvisioning resources for active stack \u001b[0m\u001b[2;32m'sagemaker-airflow-stack'\u001b[0m\u001b[2;36m.\u001b[0m\n", + "\u001b[1;35mProvisioning resources for stack 'sagemaker-airflow-stack'.\u001b[0m\n", + "\u001b[1;35mReading the config from /home/apenner/.config/zenml/airflow/f3b0bda3-5245-4134-8ad9-6b42affebcf5/airflow.cfg\u001b[0m\n", + "\u001b[1;35mConfigured default timezone Timezone('UTC')\u001b[0m\n", + "\u001b[1;35mResuming provisioned resources for stack 
sagemaker-airflow-stack.\u001b[0m\n" + ] + } + ], + "source": [ + "!zenml stack set sagemaker-airflow-stack\n", + "!zenml integration install airflow -y\n", + "!pip install apache-airflow-providers-docker apache-airflow~=2.5.0\n", + "!zenml stack up" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e188c8c5-3e8f-42b2-9f98-380c265cf8ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mtraining\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mReloading configuration file /home/apenner/PycharmProjects/zenml-projects/stack-showcase/.zen/config.yaml\u001b[0m\n", + "\u001b[1;35mReusing registered version: \u001b[0m\u001b[1;36m(version: 8)\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mNew model version \u001b[0m\u001b[1;36m11\u001b[1;35m was created.\u001b[0m\n", + "\u001b[1;35mBuilding Docker image(s) for pipeline \u001b[0m\u001b[1;36mtraining\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mBuilding Docker image \u001b[0m\u001b[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35m- Including user-defined requirements: \u001b[0m\u001b[1;36mpyarrow\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m- Including integration requirements: \u001b[0m\u001b[1;36mapache-airflow~=2.4.0\u001b[1;35m, \u001b[0m\u001b[1;36maws-profile-manager\u001b[1;35m, \u001b[0m\u001b[1;36mboto3<=1.26.76\u001b[1;35m, \u001b[0m\u001b[1;36mkubernetes\u001b[1;35m, \u001b[0m\u001b[1;36ms3fs>2022.3.0,<=2023.4.0\u001b[1;35m, \u001b[0m\u001b[1;36msagemaker==2.117.0\u001b[1;35m, \u001b[0m\u001b[1;36mscikit-learn<1.3\u001b[1;35m\u001b[0m\n", + "\u001b[33mCould not import Azure service connector: No module named 'azure.identity'.\u001b[0m\n", + "\u001b[1;35mStep 1/10 : FROM zenmldocker/zenml:0.50.0-py3.9\u001b[0m\n", + "\u001b[1;35mStep 2/10 : WORKDIR /app\u001b[0m\n", + "\u001b[1;35mStep 3/10 : COPY .zenml_user_requirements .\u001b[0m\n", + "\u001b[1;35mStep 4/10 : RUN pip install --default-timeout=60 --no-cache-dir -r .zenml_user_requirements\u001b[0m\n", + "\u001b[1;35mStep 5/10 : COPY .zenml_integration_requirements .\u001b[0m\n", + "\u001b[1;35mStep 6/10 : RUN pip install --default-timeout=60 --no-cache-dir -r .zenml_integration_requirements\u001b[0m\n", + "\u001b[1;35mStep 7/10 : ENV ZENML_ENABLE_REPO_INIT_WARNINGS=False\u001b[0m\n", + "\u001b[1;35mStep 8/10 : ENV ZENML_CONFIG_PATH=/app/.zenconfig\u001b[0m\n", + "\u001b[1;35mStep 9/10 : COPY . .\u001b[0m\n", + "\u001b[1;35mStep 10/10 : RUN chmod -R a+rw .\u001b[0m\n", + "\u001b[1;35mFound credentials in shared credentials file: ~/.aws/credentials\u001b[0m\n", + "\u001b[33mAmazon ECR requires you to create a repository before you can push an image to it. ZenML is trying to push the image 715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator but could not find any repositories because your local AWS credentials are not set. 
We will try to push anyway, but in case it fails you need to create a repository named \u001b[0m\u001b[1;36mzenml\u001b[33m.\u001b[0m\n", + "\u001b[1;35mPushing Docker image \u001b[0m\u001b[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-orchestrator\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35mFinished pushing Docker image.\u001b[0m\n", + "\u001b[1;35mBuilding Docker image \u001b[0m\u001b[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-model_trainer-sagemaker_step_operator\u001b[1;35m.\u001b[0m\n", + "\u001b[1;35m- Including user-defined requirements: \u001b[0m\u001b[1;36mpyarrow\u001b[1;35m\u001b[0m\n", + "\u001b[1;35m- Including integration requirements: \u001b[0m\u001b[1;36mapache-airflow~=2.4.0\u001b[1;35m, \u001b[0m\u001b[1;36maws-profile-manager\u001b[1;35m, \u001b[0m\u001b[1;36mboto3<=1.26.76\u001b[1;35m, \u001b[0m\u001b[1;36mkubernetes\u001b[1;35m, \u001b[0m\u001b[1;36ms3fs>2022.3.0,<=2023.4.0\u001b[1;35m, \u001b[0m\u001b[1;36msagemaker==2.117.0\u001b[1;35m, \u001b[0m\u001b[1;36mscikit-learn<1.3\u001b[1;35m\u001b[0m\n", + "\u001b[1;35mStep 1/11 : FROM zenmldocker/zenml:0.50.0-py3.9\u001b[0m\n", + "\u001b[1;35mStep 2/11 : WORKDIR /app\u001b[0m\n", + "\u001b[1;35mStep 3/11 : COPY .zenml_user_requirements .\u001b[0m\n", + "\u001b[1;35mStep 4/11 : RUN pip install --default-timeout=60 --no-cache-dir -r .zenml_user_requirements\u001b[0m\n", + "\u001b[1;35mStep 5/11 : COPY .zenml_integration_requirements .\u001b[0m\n", + "\u001b[1;35mStep 6/11 : RUN pip install --default-timeout=60 --no-cache-dir -r .zenml_integration_requirements\u001b[0m\n", + "\u001b[1;35mStep 7/11 : ENV ZENML_ENABLE_REPO_INIT_WARNINGS=False\u001b[0m\n", + "\u001b[1;35mStep 8/11 : ENV ZENML_CONFIG_PATH=/app/.zenconfig\u001b[0m\n", + "\u001b[1;35mStep 9/11 : COPY . .\u001b[0m\n", + "\u001b[1;35mStep 10/11 : RUN chmod -R a+rw .\u001b[0m\n", + "\u001b[1;35mStep 11/11 : ENTRYPOINT $__ZENML_ENTRYPOINT\u001b[0m\n", + "\u001b[33mAmazon ECR requires you to create a repository before you can push an image to it. ZenML is trying to push the image 715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-model_trainer-sagemaker_step_operator but could not find any repositories because your local AWS credentials are not set. 
We will try to push anyway, but in case it fails you need to create a repository named \u001b[0m\u001b[1;36mzenml\u001b[33m.\u001b[0m\n",
+ "\u001b[1;35mPushing Docker image \u001b[0m\u001b[1;36m715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml:training-model_trainer-sagemaker_step_operator\u001b[1;35m.\u001b[0m\n",
+ "\u001b[1;35mFinished pushing Docker image.\u001b[0m\n",
+ "\u001b[1;35mFinished building Docker image(s).\u001b[0m\n",
+ "\u001b[1;35mReading the config from /home/apenner/.config/zenml/airflow/f3b0bda3-5245-4134-8ad9-6b42affebcf5/airflow.cfg\u001b[0m\n",
+ "\u001b[1;35mConfigured default timezone Timezone('UTC')\u001b[0m\n",
+ "\u001b[1;35mExecuting a new run.\u001b[0m\n",
+ "\u001b[1;35mCaching is disabled by default for \u001b[0m\u001b[1;36mtraining\u001b[1;35m.\u001b[0m\n",
+ "\u001b[1;35mUsing a build:\u001b[0m\n",
+ "\u001b[1;35m Image(s): 715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml@sha256:609ab7153bb059a2e7eadef380391d45e09768ef1a482ebaf264bac6f20b71d7, 715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml@sha256:bd4a950a3f8962b2dc3dca017648ba5346958bdb1e89e31a570a6b6196011e41\u001b[0m\n",
+ "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36msagemaker-airflow-stack\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35m  step_operator: \u001b[0m\u001b[1;36msagemaker-eu\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35m  container_registry: \u001b[0m\u001b[1;36maws-eu\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35m  image_builder: \u001b[0m\u001b[1;36mlocal\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35m  orchestrator: \u001b[0m\u001b[1;36mairflow_orchestrator\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35m  artifact_store: \u001b[0m\u001b[1;36ms3-zenfiles\u001b[1;35m\u001b[0m\n",
+ "\u001b[1;35mWriting DAG definition to \u001b[0m\u001b[1;36m/home/apenner/.config/zenml/airflow/f3b0bda3-5245-4134-8ad9-6b42affebcf5/dags/training_761957255a9c2a84186beb62cb17f90.zip\u001b[1;35m.\u001b[0m\n",
+ "\u001b[33mYour orchestrator 'airflow_orchestrator' is running remotely. 
Note that the pipeline run will only show up on the ZenML dashboard once the first step has started executing on the remote infrastructure.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "fe_t_configured()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0841f93b-9eb5-4af6-bba7-cec167024ccf",
+ "metadata": {},
+ "source": [
+ "# Switch to full Sagemaker Stack\n",
+ "\n",
+ "![Sagemaker stack](_assets/sagemaker_stack.png)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8e33484-3377-4f0e-83fa-87d7c0ca4d72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!zenml stack set sagemaker-stack"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a03c95e9-df2e-446c-8d61-9cc37ad8a46a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fe_t_configured()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5e46721-3733-439e-a03e-54512552eed7",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e90d8044-659c-4b45-bf84-9b3ef69749f9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/stack-showcase/steps/__init__.py b/stack-showcase/steps/__init__.py
new file mode 100644
index 00000000..2f856719
--- /dev/null
+++ b/stack-showcase/steps/__init__.py
@@ -0,0 +1,26 @@
+# {% include 'template/license_header' %}
+
+from .data_loader import (
+    data_loader,
+)
+from .data_preprocessor import (
+    data_preprocessor,
+)
+from .data_splitter import (
+    data_splitter,
+)
+from .inference_predict import (
+    inference_predict,
+)
+from .inference_preprocessor import (
+    inference_preprocessor,
+)
+from .model_evaluator import (
+    model_evaluator,
+)
+from .model_trainer import (
+    model_trainer,
+)
+from .model_promoter import (
+    model_promoter,
+)
\ No newline at end of file
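The notebook output above shows runs of a pipeline named `training` that is assembled from the steps this `__init__.py` exports. The pipeline definition itself is not part of this diff, so the following is only a minimal sketch of how these steps could be wired together; the decorator usage follows ZenML's pipeline API, but the function name, parameters, and defaults here are assumptions:

```python
# Hypothetical wiring of the exported steps into a training pipeline.
from zenml import pipeline

from steps import (
    data_loader,
    data_preprocessor,
    data_splitter,
    model_evaluator,
    model_promoter,
    model_trainer,
)


@pipeline
def training(random_state: int = 42, test_size: float = 0.2):
    # Load the raw data, split it, and preprocess both splits.
    dataset = data_loader(random_state=random_state)
    dataset_trn, dataset_tst = data_splitter(dataset=dataset, test_size=test_size)
    dataset_trn, dataset_tst, _ = data_preprocessor(
        dataset_trn=dataset_trn, dataset_tst=dataset_tst
    )
    # Train, evaluate, and conditionally promote the model.
    model = model_trainer(dataset_trn=dataset_trn)
    accuracy = model_evaluator(
        model=model, dataset_trn=dataset_trn, dataset_tst=dataset_tst
    )
    model_promoter(accuracy=accuracy)
```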
diff --git a/stack-showcase/steps/data_loader.py b/stack-showcase/steps/data_loader.py
new file mode 100644
index 00000000..cc6df580
--- /dev/null
+++ b/stack-showcase/steps/data_loader.py
@@ -0,0 +1,53 @@
+# {% include 'template/license_header' %}
+
+import pandas as pd
+from sklearn.datasets import load_breast_cancer
+from typing_extensions import Annotated
+from zenml import log_artifact_metadata, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def data_loader(
+    random_state: int, is_inference: bool = False, target: str = "target"
+) -> Annotated[pd.DataFrame, "dataset"]:
+    """Dataset reader step.
+
+    This is an example of a dataset reader step that loads the Breast Cancer
+    dataset.
+
+    This step is parameterized, which allows you to configure the step
+    independently of the step code, before running it in a pipeline.
+    In this example, the step can be configured with a sampling random state
+    and with logic that controls whether the target column is dropped.
+    See the documentation for more information:
+
+    https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+    Args:
+        is_inference: If `True` a small inference subset is returned and the
+            target column is removed from the dataset.
+        random_state: Random state for sampling.
+        target: Name of the target column in the dataset.
+
+    Returns:
+        The dataset artifact as a Pandas DataFrame.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    dataset = load_breast_cancer(as_frame=True)
+    inference_size = int(len(dataset.target) * 0.05)
+    dataset: pd.DataFrame = dataset.frame
+    inference_subset = dataset.sample(inference_size, random_state=random_state)
+    if is_inference:
+        dataset = inference_subset
+        dataset.drop(columns=target, inplace=True)
+    else:
+        dataset.drop(inference_subset.index, inplace=True)
+    dataset.reset_index(drop=True, inplace=True)
+    logger.info(f"Dataset with {len(dataset)} records loaded!")
+
+    # Recording metadata for this dataset
+    log_artifact_metadata(metadata={"random_state": random_state, "target": target})
+
+    ### YOUR CODE ENDS HERE ###
+    return dataset
+ """ + ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + # We use the sklearn pipeline to chain together multiple preprocessing steps + preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) + if drop_na: + preprocess_pipeline.steps.append(("drop_na", NADropper())) + if drop_columns: + # Drop columns + preprocess_pipeline.steps.append(("drop_columns", ColumnsDropper(drop_columns))) + if normalize: + # Normalize the data + preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) + preprocess_pipeline.steps.append(("cast", DataFrameCaster(dataset_trn.columns))) + dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) + dataset_tst = preprocess_pipeline.transform(dataset_tst) + + # Log metadata of target to both datasets + log_artifact_metadata( + artifact_name="dataset_trn", + metadata={"target": target}, + ) + log_artifact_metadata( + artifact_name="dataset_tst", + metadata={"target": target}, + ) + + ### YOUR CODE ENDS HERE ### + return dataset_trn, dataset_tst, preprocess_pipeline diff --git a/stack-showcase/steps/data_splitter.py b/stack-showcase/steps/data_splitter.py new file mode 100644 index 00000000..dbab8157 --- /dev/null +++ b/stack-showcase/steps/data_splitter.py @@ -0,0 +1,47 @@ +# {% include 'template/license_header' %} + +from typing import Tuple + +import pandas as pd +from sklearn.model_selection import train_test_split +from typing_extensions import Annotated +from zenml import step + + +@step +def data_splitter( + dataset: pd.DataFrame, test_size: float = 0.2 +) -> Tuple[ + Annotated[pd.DataFrame, "raw_dataset_trn"], + Annotated[pd.DataFrame, "raw_dataset_tst"], +]: + """Dataset splitter step. + + This is an example of a dataset splitter step that splits the data + into train and test set before passing it to ML model. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different test + set sizes. See the documentation for more information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + dataset: Dataset read from source. + test_size: 0.0..1.0 defining portion of test set. + + Returns: + The split dataset: dataset_trn, dataset_tst. + """ + ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + dataset_trn, dataset_tst = train_test_split( + dataset, + test_size=test_size, + random_state=42, + shuffle=True, + ) + dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) + dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) + ### YOUR CODE ENDS HERE ### + return dataset_trn, dataset_tst diff --git a/stack-showcase/steps/inference_predict.py b/stack-showcase/steps/inference_predict.py new file mode 100644 index 00000000..e214b0f2 --- /dev/null +++ b/stack-showcase/steps/inference_predict.py @@ -0,0 +1,60 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/stack-showcase/steps/data_splitter.py b/stack-showcase/steps/data_splitter.py
new file mode 100644
index 00000000..dbab8157
--- /dev/null
+++ b/stack-showcase/steps/data_splitter.py
@@ -0,0 +1,47 @@
+# {% include 'template/license_header' %}
+
+from typing import Tuple
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from typing_extensions import Annotated
+from zenml import step
+
+
+@step
+def data_splitter(
+    dataset: pd.DataFrame, test_size: float = 0.2
+) -> Tuple[
+    Annotated[pd.DataFrame, "raw_dataset_trn"],
+    Annotated[pd.DataFrame, "raw_dataset_tst"],
+]:
+    """Dataset splitter step.
+
+    This is an example of a dataset splitter step that splits the data
+    into train and test sets before passing them to an ML model.
+
+    This step is parameterized, which allows you to configure the step
+    independently of the step code, before running it in a pipeline.
+    In this example, the step can be configured to use different test
+    set sizes. See the documentation for more information:
+
+    https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+    Args:
+        dataset: Dataset read from source.
+        test_size: Value between 0.0 and 1.0 defining the portion of the
+            dataset assigned to the test set.
+
+    Returns:
+        The split dataset: raw_dataset_trn, raw_dataset_tst.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    dataset_trn, dataset_tst = train_test_split(
+        dataset,
+        test_size=test_size,
+        random_state=42,
+        shuffle=True,
+    )
+    dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns)
+    dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns)
+    ### YOUR CODE ENDS HERE ###
+    return dataset_trn, dataset_tst
diff --git a/stack-showcase/steps/inference_predict.py b/stack-showcase/steps/inference_predict.py
new file mode 100644
index 00000000..e214b0f2
--- /dev/null
+++ b/stack-showcase/steps/inference_predict.py
@@ -0,0 +1,60 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2023. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import pandas as pd
+from typing_extensions import Annotated
+from zenml import get_step_context, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def inference_predict(
+    dataset_inf: pd.DataFrame,
+) -> Annotated[pd.Series, "predictions"]:
+    """Predictions step.
+
+    This is an example of a predictions step that takes the data in and returns
+    predicted values.
+
+    This step is parameterized, which allows you to configure the step
+    independently of the step code, before running it in a pipeline.
+    In this example, the step can be configured to use different input data.
+    See the documentation for more information:
+
+    https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+    Args:
+        dataset_inf: The inference dataset.
+
+    Returns:
+        The predictions as a pandas Series.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    model_version = get_step_context().model_version
+
+    # run prediction from memory
+    predictor = model_version.load_artifact("model")
+    predictions = predictor.predict(dataset_inf)
+
+    predictions = pd.Series(predictions, name="predicted")
+    ### YOUR CODE ENDS HERE ###
+
+    return predictions
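Stripped of the ZenML step context, `inference_predict` boils down to a plain scikit-learn predict call wrapped in a named `pd.Series`. A minimal, runnable equivalent, using the same Breast Cancer data and classifier type as the rest of this example project (the training-on-the-fly here is purely for illustration; in the pipeline the model comes from the model registry):

```python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

data = load_breast_cancer(as_frame=True)
X, y = data.frame.drop(columns="target"), data.frame["target"]

# Stand-in for `model_version.load_artifact("model")`.
predictor = DecisionTreeClassifier().fit(X, y)

# Wrapping the raw ndarray in a named Series gives the output artifact
# a meaningful column name when it is stored and displayed downstream.
predictions = pd.Series(predictor.predict(X.head(5)), name="predicted")
print(predictions)
```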
+ """ + ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + # artificially adding `target` column to avoid Pipeline issues + dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) + dataset_inf = preprocess_pipeline.transform(dataset_inf) + dataset_inf.drop(columns=["target"], inplace=True) + ### YOUR CODE ENDS HERE ### + + return dataset_inf diff --git a/stack-showcase/steps/model_evaluator.py b/stack-showcase/steps/model_evaluator.py new file mode 100644 index 00000000..27613641 --- /dev/null +++ b/stack-showcase/steps/model_evaluator.py @@ -0,0 +1,94 @@ +# {% include 'template/license_header' %} + +import pandas as pd +from sklearn.base import ClassifierMixin +from zenml import step, log_artifact_metadata +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_evaluator( + model: ClassifierMixin, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + min_train_accuracy: float = 0.0, + min_test_accuracy: float = 0.0, +) -> float: + """Evaluate a trained model. + + This is an example of a model evaluation step that takes in a model artifact + previously trained by another step in your pipeline, and a training + and validation data set pair which it uses to evaluate the model's + performance. The model metrics are then returned as step output artifacts + (in this case, the model accuracy on the train and test set). + + The suggested step implementation also outputs some warnings if the model + performance does not meet some minimum criteria. This is just an example of + how you can use steps to monitor your model performance and alert you if + something goes wrong. As an alternative, you can raise an exception in the + step to force the pipeline run to fail early and all subsequent steps to + be skipped. + + This step is parameterized to configure the step independently of the step code, + before running it in a pipeline. In this example, the step can be configured + to use different values for the acceptable model performance thresholds and + to control whether the pipeline run should fail if the model performance + does not meet the minimum criteria. See the documentation for more + information: + + https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines + + Args: + model: The pre-trained model artifact. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + min_train_accuracy: Minimal acceptable training accuracy value. + min_test_accuracy: Minimal acceptable testing accuracy value. + fail_on_accuracy_quality_gates: If `True` a `RuntimeException` is raised + upon not meeting one of the minimal accuracy thresholds. + + Returns: + The model accuracy on the test set. + + Raises: + RuntimeError: if any of accuracies is lower than respective threshold + """ + ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### + # context = get_step_context() + # target = context.inputs["dataset_trn"].run_metadata['target'].value + target = "target" + + # Calculate the model accuracy on the train and test set + trn_acc = model.score( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + logger.info(f"Train accuracy={trn_acc*100:.2f}%") + tst_acc = model.score( + dataset_tst.drop(columns=[target]), + dataset_tst[target], + ) + logger.info(f"Test accuracy={tst_acc*100:.2f}%") + + messages = [] + if trn_acc < min_train_accuracy: + messages.append( + f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" 
diff --git a/stack-showcase/steps/model_evaluator.py b/stack-showcase/steps/model_evaluator.py
new file mode 100644
index 00000000..27613641
--- /dev/null
+++ b/stack-showcase/steps/model_evaluator.py
@@ -0,0 +1,94 @@
+# {% include 'template/license_header' %}
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from zenml import step, log_artifact_metadata
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_evaluator(
+    model: ClassifierMixin,
+    dataset_trn: pd.DataFrame,
+    dataset_tst: pd.DataFrame,
+    min_train_accuracy: float = 0.0,
+    min_test_accuracy: float = 0.0,
+) -> float:
+    """Evaluate a trained model.
+
+    This is an example of a model evaluation step that takes in a model artifact
+    previously trained by another step in your pipeline, and a training
+    and validation data set pair which it uses to evaluate the model's
+    performance. The model metrics are then returned as step output artifacts
+    (in this case, the model accuracy on the train and test set).
+
+    The suggested step implementation also outputs some warnings if the model
+    performance does not meet some minimum criteria. This is just an example of
+    how you can use steps to monitor your model performance and alert you if
+    something goes wrong. As an alternative, you can raise an exception in the
+    step to force the pipeline run to fail early and all subsequent steps to
+    be skipped.
+
+    This step is parameterized to configure the step independently of the step code,
+    before running it in a pipeline. In this example, the step can be configured
+    to use different values for the acceptable model performance thresholds.
+    See the documentation for more information:
+
+    https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+    Args:
+        model: The pre-trained model artifact.
+        dataset_trn: The train dataset.
+        dataset_tst: The test dataset.
+        min_train_accuracy: Minimal acceptable training accuracy value.
+        min_test_accuracy: Minimal acceptable testing accuracy value.
+
+    Returns:
+        The model accuracy on the test set.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    # context = get_step_context()
+    # target = context.inputs["dataset_trn"].run_metadata['target'].value
+    target = "target"
+
+    # Calculate the model accuracy on the train and test set
+    trn_acc = model.score(
+        dataset_trn.drop(columns=[target]),
+        dataset_trn[target],
+    )
+    logger.info(f"Train accuracy={trn_acc*100:.2f}%")
+    tst_acc = model.score(
+        dataset_tst.drop(columns=[target]),
+        dataset_tst[target],
+    )
+    logger.info(f"Test accuracy={tst_acc*100:.2f}%")
+
+    # Collect a warning for every accuracy threshold that is not met
+    messages = []
+    if trn_acc < min_train_accuracy:
+        messages.append(
+            f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}%!"
+        )
+    if tst_acc < min_test_accuracy:
+        messages.append(
+            f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}%!"
+        )
+    for message in messages:
+        logger.warning(message)
+
+    log_artifact_metadata(
+        metadata={"train_accuracy": float(trn_acc), "test_accuracy": float(tst_acc)},
+        artifact_name="model",
+    )
+
+    ### YOUR CODE ENDS HERE ###
+    return float(tst_acc)
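The docstring above mentions raising an exception as an alternative to logging warnings, so that the run fails early and all subsequent steps are skipped. A sketch of what such a quality gate could look like; this helper is hypothetical and not part of the PR:

```python
def assert_quality_gates(
    trn_acc: float,
    tst_acc: float,
    min_train_accuracy: float,
    min_test_accuracy: float,
) -> None:
    """Raise RuntimeError if either accuracy is below its threshold."""
    failures = []
    if trn_acc < min_train_accuracy:
        failures.append(f"train accuracy {trn_acc:.2%} < {min_train_accuracy:.2%}")
    if tst_acc < min_test_accuracy:
        failures.append(f"test accuracy {tst_acc:.2%} < {min_test_accuracy:.2%}")
    if failures:
        # Raising inside a step aborts the pipeline run immediately.
        raise RuntimeError("Accuracy quality gates failed: " + "; ".join(failures))
```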
diff --git a/stack-showcase/steps/model_promoter.py b/stack-showcase/steps/model_promoter.py
new file mode 100644
index 00000000..784e0cc5
--- /dev/null
+++ b/stack-showcase/steps/model_promoter.py
@@ -0,0 +1,42 @@
+# {% include 'template/license_header' %}
+
+from zenml import get_step_context, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def model_promoter(accuracy: float, stage: str = "production") -> bool:
+    """Model promoter step.
+
+    This is an example of a step that promotes the trained model to the given
+    stage if its accuracy clears a minimum threshold.
+
+    This step is parameterized, which allows you to configure the step
+    independently of the step code, before running it in a pipeline.
+    In this example, the step can be configured with the stage to promote
+    the model to. See the documentation for more information:
+
+    https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines
+
+    Args:
+        accuracy: Accuracy of the model.
+        stage: Which stage to promote the model to.
+
+    Returns:
+        Whether the model was promoted or not.
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    if accuracy < 0.8:
+        logger.info(
+            f"Model accuracy {accuracy*100:.2f}% is below 80%! Not promoting model."
+        )
+        is_promoted = False
+    else:
+        logger.info(f"Model promoted to {stage}!")
+        is_promoted = True
+        model_version = get_step_context().model_version
+        model_version.set_stage(stage, force=True)
+
+    ### YOUR CODE ENDS HERE ###
+    return is_promoted
diff --git a/stack-showcase/steps/model_trainer.py b/stack-showcase/steps/model_trainer.py
new file mode 100644
index 00000000..bd305b2d
--- /dev/null
+++ b/stack-showcase/steps/model_trainer.py
@@ -0,0 +1,50 @@
+# {% include 'template/license_header' %}
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.tree import DecisionTreeClassifier
+from typing_extensions import Annotated
+from zenml import ArtifactConfig, step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step(enable_cache=False)
+def model_trainer(
+    dataset_trn: pd.DataFrame,
+) -> Annotated[ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True)]:
+    """Configure and train a model on the training dataset.
+
+    This is an example of a model training step that takes in a dataset artifact
+    previously loaded and pre-processed by other steps in your pipeline, then
+    configures and trains a model on it. The model is then returned as a step
+    output artifact.
+
+    Args:
+        dataset_trn: The preprocessed train dataset.
+
+    Returns:
+        The trained model artifact.
+    """
+
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+
+    # Use the dataset to fetch the target
+    # context = get_step_context()
+    # target = context.inputs["dataset_trn"].run_metadata['target'].value
+    target = "target"
+
+    # Initialize the model and train it on the training set.
+    model = DecisionTreeClassifier()
+    logger.info(f"Training model {model}...")
+
+    model.fit(
+        dataset_trn.drop(columns=[target]),
+        dataset_trn[target],
+    )
+    ### YOUR CODE ENDS HERE ###
+
+    return model
diff --git a/stack-showcase/utils/__init__.py b/stack-showcase/utils/__init__.py
new file mode 100644
index 00000000..4bc11e5e
--- /dev/null
+++ b/stack-showcase/utils/__init__.py
@@ -0,0 +1 @@
+# {% include 'template/license_header' %}
diff --git a/stack-showcase/utils/preprocess.py b/stack-showcase/utils/preprocess.py
new file mode 100644
index 00000000..2dd4a859
--- /dev/null
+++ b/stack-showcase/utils/preprocess.py
@@ -0,0 +1,41 @@
+# {% include 'template/license_header' %}
+
+from typing import Union
+
+import pandas as pd
+
+
+class NADropper:
+    """Support class to drop NA values in an sklearn Pipeline."""
+
+    def fit(self, *args, **kwargs):
+        return self
+
+    def transform(self, X: Union[pd.DataFrame, pd.Series]):
+        return X.dropna()
+
+
+class ColumnsDropper:
+    """Support class to drop specific columns in an sklearn Pipeline."""
+
+    def __init__(self, columns):
+        self.columns = columns
+
+    def fit(self, *args, **kwargs):
+        return self
+
+    def transform(self, X: Union[pd.DataFrame, pd.Series]):
+        return X.drop(columns=self.columns)
+
+
+class DataFrameCaster:
+    """Support class to cast type back to pd.DataFrame in an sklearn Pipeline."""
+
+    def __init__(self, columns):
+        self.columns = columns
+
+    def fit(self, *args, **kwargs):
+        return self
+
+    def transform(self, X):
+        return pd.DataFrame(X, columns=self.columns)
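Finally, a quick runnable sanity check of the three helper transformers above (toy data, illustrative only):

```python
import pandas as pd

from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper

df = pd.DataFrame({"a": [1, None, 3], "b": [4, 5, 6]})

# NADropper removes the row containing the None value.
assert len(NADropper().transform(df)) == 2

# ColumnsDropper removes the named columns.
assert list(ColumnsDropper(["b"]).transform(df).columns) == ["a"]

# DataFrameCaster turns a bare ndarray back into a labeled DataFrame.
assert isinstance(DataFrameCaster(["a", "b"]).transform(df.values), pd.DataFrame)
```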