diff --git a/.gitignore b/.gitignore index 00fba43c..ab6d468c 100644 --- a/.gitignore +++ b/.gitignore @@ -103,7 +103,7 @@ celerybeat.pid # Environments .env -.venv +.venv* env/ venv/ ENV/ @@ -142,3 +142,5 @@ mlruns/ zencoder/cloned_public_repos *wandb* + +.DS_Store \ No newline at end of file diff --git a/stack-showcase/.dockerignore b/classifier-e2e/.dockerignore similarity index 100% rename from stack-showcase/.dockerignore rename to classifier-e2e/.dockerignore diff --git a/stack-showcase/README.md b/classifier-e2e/README.md similarity index 100% rename from stack-showcase/README.md rename to classifier-e2e/README.md diff --git a/classifier-e2e/_assets/cloud_mcp.png b/classifier-e2e/_assets/cloud_mcp.png new file mode 100644 index 00000000..33117b6e Binary files /dev/null and b/classifier-e2e/_assets/cloud_mcp.png differ diff --git a/classifier-e2e/_assets/cloud_mcp_predictions.png b/classifier-e2e/_assets/cloud_mcp_predictions.png new file mode 100644 index 00000000..a6bf7c90 Binary files /dev/null and b/classifier-e2e/_assets/cloud_mcp_predictions.png differ diff --git a/classifier-e2e/_assets/cloud_mcp_screenshot.png b/classifier-e2e/_assets/cloud_mcp_screenshot.png new file mode 100644 index 00000000..8f56defa Binary files /dev/null and b/classifier-e2e/_assets/cloud_mcp_screenshot.png differ diff --git a/classifier-e2e/_assets/deployment_pipeline.png b/classifier-e2e/_assets/deployment_pipeline.png new file mode 100644 index 00000000..39b7a961 Binary files /dev/null and b/classifier-e2e/_assets/deployment_pipeline.png differ diff --git a/classifier-e2e/_assets/feature_engineering_pipeline.png b/classifier-e2e/_assets/feature_engineering_pipeline.png new file mode 100644 index 00000000..332e296e Binary files /dev/null and b/classifier-e2e/_assets/feature_engineering_pipeline.png differ diff --git a/classifier-e2e/_assets/inference_pipeline.png b/classifier-e2e/_assets/inference_pipeline.png new file mode 100644 index 00000000..d63da4a2 Binary files /dev/null and b/classifier-e2e/_assets/inference_pipeline.png differ diff --git a/classifier-e2e/_assets/pipeline_overview.png b/classifier-e2e/_assets/pipeline_overview.png new file mode 100644 index 00000000..9071da7d Binary files /dev/null and b/classifier-e2e/_assets/pipeline_overview.png differ diff --git a/classifier-e2e/_assets/sagemaker_stack.png b/classifier-e2e/_assets/sagemaker_stack.png new file mode 100644 index 00000000..fb811a4b Binary files /dev/null and b/classifier-e2e/_assets/sagemaker_stack.png differ diff --git a/classifier-e2e/_assets/training_pipeline.png b/classifier-e2e/_assets/training_pipeline.png new file mode 100644 index 00000000..85ec8ca0 Binary files /dev/null and b/classifier-e2e/_assets/training_pipeline.png differ diff --git a/classifier-e2e/configs/feature_engineering.yaml b/classifier-e2e/configs/feature_engineering.yaml new file mode 100644 index 00000000..d5ab2129 --- /dev/null +++ b/classifier-e2e/configs/feature_engineering.yaml @@ -0,0 +1,10 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# pipeline configuration +test_size: 0.35 \ No newline at end of file diff --git a/classifier-e2e/configs/inference.yaml b/classifier-e2e/configs/inference.yaml new file mode 100644 index 00000000..52421c4c --- /dev/null +++ b/classifier-e2e/configs/inference.yaml @@ -0,0 +1,12 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane 
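+# Note: `version` accepts a stage name as well as an explicit version number; +# "production" resolves to whichever model version currently holds that stage.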
+model: + name: "breast_cancer_classifier" + version: "production" \ No newline at end of file diff --git a/stack-showcase/configs/training.yaml b/classifier-e2e/configs/training_sgd.yaml similarity index 60% rename from stack-showcase/configs/training.yaml rename to classifier-e2e/configs/training_sgd.yaml index 3f3c02c2..d070395e 100644 --- a/stack-showcase/configs/training.yaml +++ b/classifier-e2e/configs/training_sgd.yaml @@ -5,18 +5,21 @@ settings: - sklearn requirements: - pyarrow - - huggingface_hub # configuration of the Model Control Plane model: name: breast_cancer_classifier license: Apache 2.0 - description: Classification of Breast Cancer Dataset. - tags: ["classification", "sklearn"] + description: A breast cancer classifier + tags: ["breast_cancer", "classifier", "sgd"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between sgd/xgboost steps: model_trainer: settings: step_operator.sagemaker: estimator_args: - instance_type: "ml.m5.large" + instance_type: ml.m5.large \ No newline at end of file diff --git a/classifier-e2e/configs/training_sgd_sagemaker.yaml b/classifier-e2e/configs/training_sgd_sagemaker.yaml new file mode 100644 index 00000000..0bb6a841 --- /dev/null +++ b/classifier-e2e/configs/training_sgd_sagemaker.yaml @@ -0,0 +1,26 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier", "sgd"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between sgd/xgboost + +steps: + model_trainer: + step_operator: sagemaker-eu + settings: + step_operator.sagemaker: + estimator_args: + instance_type: ml.m5.large \ No newline at end of file diff --git a/classifier-e2e/configs/training_xgboost.yaml b/classifier-e2e/configs/training_xgboost.yaml new file mode 100644 index 00000000..6796ea97 --- /dev/null +++ b/classifier-e2e/configs/training_xgboost.yaml @@ -0,0 +1,26 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + - xgboost + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier", "xgboost"] + +# Configure the pipeline +parameters: + model_type: "xgboost" # Choose between sgd/xgboost + +steps: + model_trainer: + settings: + step_operator.sagemaker: + estimator_args: + instance_type: ml.m5.large \ No newline at end of file diff --git a/classifier-e2e/configs/training_xgboost_sagemaker.yaml b/classifier-e2e/configs/training_xgboost_sagemaker.yaml new file mode 100644 index 00000000..96e3d991 --- /dev/null +++ b/classifier-e2e/configs/training_xgboost_sagemaker.yaml @@ -0,0 +1,27 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + - xgboost + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier", "xgboost"] + +# Configure the pipeline +parameters: + model_type: "xgboost" # Choose between sgd/xgboost + +steps: + model_trainer: + step_operator: sagemaker-eu + settings: + step_operator.sagemaker: + estimator_args: + instance_type: ml.m5.large \ No newline at end of file diff --git 
a/classifier-e2e/pipelines/__init__.py b/classifier-e2e/pipelines/__init__.py new file mode 100644 index 00000000..0faf2902 --- /dev/null +++ b/classifier-e2e/pipelines/__init__.py @@ -0,0 +1,21 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training +from .deploy import deploy \ No newline at end of file diff --git a/classifier-e2e/pipelines/deploy.py b/classifier-e2e/pipelines/deploy.py new file mode 100644 index 00000000..f9b77fc1 --- /dev/null +++ b/classifier-e2e/pipelines/deploy.py @@ -0,0 +1,25 @@ +from zenml import pipeline, get_pipeline_context +from steps import data_loader, inference_preprocessor +import random +from steps import deploy_endpoint, predict_on_endpoint, shutdown_endpoint + + +@pipeline +def deploy(shutdown_endpoint_after_predicting: bool = True): + # Get the preprocess pipeline artifact associated with this version + preprocess_pipeline = get_pipeline_context().model.get_artifact( + "preprocess_pipeline" + ) + + df_inference = data_loader( + random_state=random.randint(0, 1000), is_inference=True + ) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + preprocess_pipeline=preprocess_pipeline, + target="target", + ) + predictor = deploy_endpoint() + predict_on_endpoint(predictor, df_inference) + if shutdown_endpoint_after_predicting: + shutdown_endpoint(predictor, after=["predict_on_endpoint"]) diff --git a/stack-showcase/pipelines/feature_engineering.py b/classifier-e2e/pipelines/feature_engineering.py similarity index 59% rename from stack-showcase/pipelines/feature_engineering.py rename to classifier-e2e/pipelines/feature_engineering.py index 46f5e0cd..0dcdbb74 100644 --- a/stack-showcase/pipelines/feature_engineering.py +++ b/classifier-e2e/pipelines/feature_engineering.py @@ -1,13 +1,29 @@ -# {% include 'template/license_header' %} +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -import random from typing import List, Optional +import random from steps import ( data_loader, data_preprocessor, data_splitter, ) + from zenml import pipeline from zenml.logger import get_logger @@ -21,6 +37,7 @@ def feature_engineering( normalize: Optional[bool] = None, drop_columns: Optional[List[str]] = None, target: Optional[str] = "target", + random_state: Optional[int] = None, ): """ Feature engineering pipeline. @@ -34,11 +51,16 @@ normalize: If `True` dataset will be normalized with MinMaxScaler drop_columns: List of columns to drop from dataset target: Name of target column in dataset + random_state: Random state to configure the data loader + + Returns: + The processed datasets (dataset_trn, dataset_tst). """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### # Link all the steps together by calling them and passing the output # of one step as the input of the next step. - raw_data = data_loader(random_state=random.randint(0, 100), target=target) + if random_state is None: + random_state = random.randint(0, 1000) + raw_data = data_loader(random_state=random_state, target=target) dataset_trn, dataset_tst = data_splitter( dataset=raw_data, test_size=test_size, @@ -50,5 +72,6 @@ normalize=normalize, drop_columns=drop_columns, target=target, + random_state=random_state, ) return dataset_trn, dataset_tst diff --git a/classifier-e2e/pipelines/inference.py b/classifier-e2e/pipelines/inference.py new file mode 100644 index 00000000..46620c80 --- /dev/null +++ b/classifier-e2e/pipelines/inference.py @@ -0,0 +1,62 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from steps import ( + data_loader, + inference_predict, + inference_preprocessor, +) + +from zenml import get_pipeline_context, pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def inference(random_state: int, target: str): + """ + Model inference pipeline. + + This is a pipeline that loads the inference data, processes it with + the same preprocessing pipeline used in training, and runs inference + with the trained model. + + Args: + random_state: Random state for reproducibility. + target: Name of target column in dataset. + """ + # Get the production model artifact + model = get_pipeline_context().model.get_artifact("breast_cancer_classifier") + + # Get the preprocess pipeline artifact associated with this version + preprocess_pipeline = get_pipeline_context().model.get_artifact( + "preprocess_pipeline" + ) + + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. 
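+ # `random_state` and `target` are supplied by run.py, which reads them from + # the run metadata of the `preprocess_pipeline` artifact, so the data loader + # reproduces exactly the inference subset that was held out during training.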
+ df_inference = data_loader(random_state=random_state, is_inference=True) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + preprocess_pipeline=preprocess_pipeline, + target=target, + ) + inference_predict( + model=model, + dataset_inf=df_inference, + ) diff --git a/classifier-e2e/pipelines/training.py b/classifier-e2e/pipelines/training.py new file mode 100644 index 00000000..be95df32 --- /dev/null +++ b/classifier-e2e/pipelines/training.py @@ -0,0 +1,82 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional +from uuid import UUID + +from steps import model_evaluator, model_promoter, model_trainer + +from pipelines import ( + feature_engineering, +) +from zenml import pipeline +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def training( + train_dataset_id: Optional[UUID] = None, + test_dataset_id: Optional[UUID] = None, + target: Optional[str] = "target", + model_type: Optional[str] = "sgd", + random_state: Optional[int] = None, +): + """ + Model training pipeline. + + This is a pipeline that loads the data from a preprocessing pipeline, + trains a model on it and evaluates the model. If it is the first model + to be trained, it will be promoted to production. If not, it will be + promoted only if it has a higher accuracy than the current production + model version. + + Args: + train_dataset_id: ID of the train dataset produced by feature engineering. + test_dataset_id: ID of the test dataset produced by feature engineering. + target: Name of target column in dataset. + model_type: The type of model to train. + random_state: Random state used when new datasets are produced via feature engineering. + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. 
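+ # If both artifact-version IDs are supplied, the existing datasets are + # reused, which lets different model types be trained and compared on + # identical data; otherwise fresh datasets are produced below.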
+ + # Execute Feature Engineering Pipeline + if train_dataset_id is None or test_dataset_id is None: + dataset_trn, dataset_tst = feature_engineering(random_state=random_state) + else: + client = Client() + dataset_trn = client.get_artifact_version( + name_id_or_prefix=train_dataset_id + ) + dataset_tst = client.get_artifact_version( + name_id_or_prefix=test_dataset_id + ) + + model = model_trainer( + dataset_trn=dataset_trn, target=target, model_type=model_type + ) + + acc = model_evaluator( + model=model, + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + target=target, + ) + + model_promoter(accuracy=acc) diff --git a/stack-showcase/requirements.txt b/classifier-e2e/requirements.txt similarity index 51% rename from stack-showcase/requirements.txt rename to classifier-e2e/requirements.txt index e97bcdf6..c3fd2c34 100644 --- a/stack-showcase/requirements.txt +++ b/classifier-e2e/requirements.txt @@ -1,11 +1,11 @@ -zenml[server]>=0.50.0 +zenml[server]>=0.55.2 notebook scikit-learn<1.3 s3fs>2022.3.0,<=2023.4.0 boto3<=1.26.76 aws-profile-manager -mlflow>=2.1.1,<=2.9.2 -mlserver>=1.3.3 -mlserver-mlflow>=1.3.3 sagemaker==2.117.0 -huggingface_hub \ No newline at end of file +pyarrow +wandb +seaborn +xgboost \ No newline at end of file diff --git a/stack-showcase/run.py b/classifier-e2e/run.py similarity index 52% rename from stack-showcase/run.py rename to classifier-e2e/run.py index f8f2a21a..68033d98 100644 --- a/stack-showcase/run.py +++ b/classifier-e2e/run.py @@ -1,15 +1,32 @@ -# {% include 'templates/license_header' %} +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import os +import random from typing import Optional import click +import yaml from pipelines import ( feature_engineering, inference, - breast_cancer_training, - breast_cancer_deployment_pipeline + training, ) + from zenml.client import Client from zenml.logger import get_logger @@ -18,7 +35,7 @@ @click.command( help=""" -ZenML Starter project CLI v0.0.1. +ZenML Starter project. Run the ZenML starter project with basic options. @@ -68,12 +85,6 @@ help="Version of the test dataset produced by feature engineering. 
" "If not specified, a new version will be created.", ) -@click.option( - "--config", - default=None, - type=click.STRING, - help="The name of the config", -) @click.option( "--feature-pipeline", is_flag=True, @@ -93,21 +104,27 @@ help="Whether to run the pipeline that performs inference.", ) @click.option( - "--deployment-pipeline", + "--custom-training-suffix", + default="", + type=click.STRING, + help="Suffix to append to the training pipeline name.", +) +@click.option( + "--no-cache", is_flag=True, default=False, - help="Whether to run the pipeline that deploys the model.", + help="Disable caching for the pipeline run.", ) def main( train_dataset_name: str = "dataset_trn", train_dataset_version_name: Optional[str] = None, test_dataset_name: str = "dataset_tst", test_dataset_version_name: Optional[str] = None, - config: Optional[str] = None, feature_pipeline: bool = False, training_pipeline: bool = False, inference_pipeline: bool = False, - deployment_pipeline: bool = False, + custom_training_suffix: str = "", + no_cache: bool = False, ): """Main entry point for the pipeline execution. @@ -117,22 +134,39 @@ def main( (some of which may come from command line arguments, but most of which comes from the YAML config files) * launching the pipeline + + Args: + train_dataset_name: The name of the train dataset produced by feature engineering. + train_dataset_version_name: Version of the train dataset produced by feature engineering. + If not specified, a new version will be created. + test_dataset_name: The name of the test dataset produced by feature engineering. + test_dataset_version_name: Version of the test dataset produced by feature engineering. + If not specified, a new version will be created. + feature_pipeline: Whether to run the pipeline that creates the dataset. + training_pipeline: Whether to run the pipeline that trains the model. + inference_pipeline: Whether to run the pipeline that performs inference. + custom_training_suffix: The suffix to append to the pipeline configuration file when training. + no_cache: If `True` cache will be disabled. """ + client = Client() + config_folder = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", ) - client = Client() # Execute Feature Engineering Pipeline if feature_pipeline: pipeline_args = {} + if no_cache: + pipeline_args["enable_cache"] = False pipeline_args["config_path"] = os.path.join( config_folder, "feature_engineering.yaml" ) run_args_feature = {} feature_engineering.with_options(**pipeline_args)(**run_args_feature) - logger.info("Feature Engineering pipeline finished successfully!") + logger.info("Feature Engineering pipeline finished successfully!\n") + train_dataset_artifact = client.get_artifact_version( train_dataset_name ) @@ -143,14 +177,9 @@ def main( f"Version Name: {train_dataset_artifact.version} \n2. 
Test Dataset: " f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}" ) - + # Execute Training Pipeline if training_pipeline: - pipeline_args = {} - if config is None: - pipeline_args["config_path"] = os.path.join(config_folder, "training.yaml") - else: - pipeline_args["config_path"] = os.path.join(config_folder, config) run_args_train = {} # If train_dataset_version_name is specified, use versioned artifacts @@ -160,36 +189,76 @@ def main( train_dataset_version_name is not None and test_dataset_version_name is not None ) - train_dataset_artifact = client.get_artifact_version( + train_dataset_artifact_version = client.get_artifact_version( train_dataset_name, train_dataset_version_name ) # If train dataset is specified, test dataset must be specified - test_dataset_artifact = client.get_artifact_version( + test_dataset_artifact_version = client.get_artifact_version( test_dataset_name, test_dataset_version_name ) # Use versioned artifacts - run_args_train["train_dataset_id"] = train_dataset_artifact.id - run_args_train["test_dataset_id"] = test_dataset_artifact.id + run_args_train[ + "train_dataset_id" + ] = train_dataset_artifact_version.id + run_args_train[ + "test_dataset_id" + ] = test_dataset_artifact_version.id - breast_cancer_training.with_options(**pipeline_args)(**run_args_train) - logger.info("Training pipeline finished successfully!") + run_args_train["random_state"] = random.randint(0,1000) - if inference_pipeline: + # Run the SGD pipeline pipeline_args = {} - if config is None: - pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml") - else: - pipeline_args["config_path"] = os.path.join(config_folder, config) - run_args_inference = {} - inference.with_options(**pipeline_args)(**run_args_inference) - logger.info("Inference pipeline finished successfully!") + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, f"training_sgd{custom_training_suffix}.yaml" + ) + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with SGD finished successfully!\n\n") - if deployment_pipeline: + # Run the RF pipeline pipeline_args = {} - pipeline_args["config_path"] = os.path.join(config_folder, "deployment.yaml") + if no_cache: + pipeline_args["enable_cache"] = False + pipeline_args["config_path"] = os.path.join( + config_folder, f"training_xgboost{custom_training_suffix}.yaml" + ) + training.with_options(**pipeline_args)(**run_args_train) + logger.info("Training pipeline with XGBoost finished successfully!\n\n") + + if inference_pipeline: run_args_inference = {} - breast_cancer_deployment_pipeline.with_options(**pipeline_args)(**run_args_inference) - logger.info("Deployment pipeline finished successfully!") + pipeline_args = {"enable_cache": False} + pipeline_args["config_path"] = os.path.join( + config_folder, "inference.yaml" + ) + + # Configure the pipeline + inference_configured = inference.with_options(**pipeline_args) + + # Fetch the production model + with open(pipeline_args["config_path"], "r") as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + zenml_model = client.get_model_version( + config["model"]["name"], config["model"]["version"] + ) + preprocess_pipeline_artifact = zenml_model.get_artifact( + "preprocess_pipeline" + ) + + # Use the metadata of feature engineering pipeline artifact + # to get the random state and target column + random_state = preprocess_pipeline_artifact.run_metadata[ + "random_state" + ].value + target = 
preprocess_pipeline_artifact.run_metadata["target"].value + run_args_inference["random_state"] = random_state + run_args_inference["target"] = target + + # Run the pipeline + inference_configured(**run_args_inference) + logger.info("Inference pipeline finished successfully!") + if __name__ == "__main__": main() diff --git a/classifier-e2e/run_full.ipynb b/classifier-e2e/run_full.ipynb new file mode 100644 index 00000000..a83bf505 --- /dev/null +++ b/classifier-e2e/run_full.ipynb @@ -0,0 +1,1144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63f7ab34", + "metadata": {}, + "source": [ + "# ๐ŸŒ Overview\n", + "\n", + "This demo is a minimalistic MLOps project intended to showcase how to put ML workflows in production. It features: \n", + "\n", + "- A feature engineering pipeline that loads data and prepares it for training.\n", + "- A training pipeline that loads the preprocessed dataset and trains a model.\n", + "- A batch inference pipeline that runs predictions on the trained model with new data.\n", + "- Stack switching, leveraging the SageMaker step operator to offload training to the cloud.\n", + "- An analysis of training artifacts and their lineage (including the connection with W&B).\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "66b2977c", + "metadata": {}, + "source": [ + "# ๐Ÿ‘ถ Step 0. Install Requirements\n", + "\n", + "Let's install ZenML to get started. First we'll install the latest version of\n", + "ZenML as well as the `sklearn` and `xgboost` integrations of ZenML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f76f562e", + "metadata": {}, + "outputs": [], + "source": [ + "! pip3 install -r requirements.txt\n", + "! zenml integration install sklearn xgboost -y\n", + "! zenml connect --url https://1cf18d95-zenml.cloudinfra.zenml.io \n", + "\n", + "import IPython\n", + "IPython.Application.instance().kernel.do_shutdown(restart=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3b044374", + "metadata": {}, + "source": [ + "Please wait for the installation to complete before running subsequent cells. At\n", + "the end of the installation, the notebook kernel will automatically restart." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "081d5616", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize ZenML and set the default stack\n", + "!zenml init\n", + "!zenml stack set local-sagemaker-step-operator-wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f775f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do the imports at the top\n", + "from typing_extensions import Annotated\n", + "from sklearn.datasets import load_breast_cancer\n", + "\n", + "import pandas as pd\n", + "from zenml import step, pipeline, Model, get_step_context\n", + "from zenml.client import Client\n", + "from zenml.logger import get_logger\n", + "from uuid import UUID\n", + "\n", + "from zenml import pipeline\n", + "\n", + "from steps import (\n", + "    data_loader,\n", + "    inference_preprocessor\n", + ")\n", + "from pipelines import feature_engineering, training\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "# Initialize the ZenML client to fetch objects from the ZenML Server\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "35e48460", + "metadata": {}, + "source": [ + "## ๐Ÿฅ‡ Step 1: Load your data and execute feature engineering\n", + "\n", + "We'll start off by importing our data. 
In this quickstart we'll be working with\n", + "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n", + "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n", + "problem: predicting whether a patient is diagnosed with breast cancer or not.\n", + "\n", + "When you're getting started with a machine learning problem you'll want to do\n", + "something similar to this: import your data and get it in the right shape for\n", + "your training. ZenML mostly gets out of your way when you're writing your Python\n", + "code, as you'll see from the following cell.\n", + "\n", + "\"Feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cd974d1", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def data_loader_simplified(\n", + "    random_state: int, is_inference: bool = False, target: str = \"target\"\n", + ") -> Annotated[pd.DataFrame, \"dataset\"]:  # We name the dataset \n", + "    \"\"\"Dataset reader step.\"\"\"\n", + "    dataset = load_breast_cancer(as_frame=True)\n", + "    inference_size = int(len(dataset.target) * 0.05)\n", + "    dataset: pd.DataFrame = dataset.frame\n", + "    inference_subset = dataset.sample(inference_size, random_state=random_state)\n", + "    if is_inference:\n", + "        dataset = inference_subset\n", + "        dataset.drop(columns=target, inplace=True)\n", + "    else:\n", + "        dataset.drop(inference_subset.index, inplace=True)\n", + "    dataset.reset_index(drop=True, inplace=True)\n", + "    logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n", + "    return dataset\n" + ] + }, + { + "cell_type": "markdown", + "id": "1e8ba4c6", + "metadata": {}, + "source": [ + "The whole function is decorated with the `@step` decorator, which\n", + "tells ZenML to track this function as a step in the pipeline. This means that\n", + "ZenML will automatically version, track, and cache the data that is produced by\n", + "this function as an `artifact`. This is a very powerful feature, as it means that you can\n", + "reproduce your data at any point in the future, even if the original data source\n", + "changes or disappears. \n", + "\n", + "Note the use of the `Annotated` type hint (imported here from `typing_extensions`) in the output of the\n", + "step. We're using this to give a name to the output of the step, which will make\n", + "it possible to access it via a keyword later on.\n", + "\n", + "You'll also notice that we have included type hints for the outputs\n", + "to the function. These are not only useful for anyone reading your code, but\n", + "help ZenML process your data in a way appropriate to the specific data types."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d838e2ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = data_loader_simplified(random_state=42)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28c05291",
+   "metadata": {},
+   "source": [
+    "Everything looks as we'd expect and the values are all in the right format ๐Ÿฅณ.\n",
+    "\n",
+    "We're now at the point where we can bring this step (and some others) together into a single\n",
+    "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n",
+    "as simple as adding a `@pipeline` decorator to a function. This specific\n",
+    "pipeline doesn't return a value, but that option is available to you if you need it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b50a9537",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's see how the feature engineering pipeline is implemented\n",
+    "%pycat pipelines/feature_engineering.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cd73c23",
+   "metadata": {},
+   "source": [
+    "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n",
+    "pipeline function itself:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e0aa9af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_engineering(random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1785c303",
+   "metadata": {},
+   "source": [
+    "Let's run this again with a slightly different test size, to create more datasets:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "658c0570-2607-4b97-a72d-d45c92633e48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_engineering(random_state=42, test_size=0.25)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64bb7206",
+   "metadata": {},
+   "source": [
+    "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n",
+    "This is because ZenML automatically determined that nothing had changed in the data loader step, \n",
+    "so it didn't need to rerun it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf",
+   "metadata": {},
+   "source": [
+    "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e1d8546",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_engineering(test_size=0.25, random_state=104)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8471f93",
+   "metadata": {},
+   "source": [
+    "We can also fetch the pipeline from the server and view the results directly in the notebook:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f208b200",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run = client.get_pipeline(\"feature_engineering\").last_run\n",
+    "print(run.name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a037f09d",
+   "metadata": {},
+   "source": [
+    "We can also see the data artifacts that were produced by the last step of the pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34283e89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run.steps[\"data_preprocessor\"].outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bceb0312",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read one of the datasets. 
This is the one with a 0.25 test split\n", + "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()" + ] + }, + { + "cell_type": "markdown", + "id": "26d26436", + "metadata": {}, + "source": [ + "We can also get the artifacts directly. Each time you create a new pipeline run, a new `artifact version` is created.\n", + "\n", + "You can fetch these artifacts and their versions using the `client`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8f90647", + "metadata": {}, + "outputs": [], + "source": [ + "# Get artifact version from our run\n", + "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"] \n", + "\n", + "# Get latest version from client directly\n", + "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n", + "\n", + "# This should be true if our run is the latest run and no artifact has been produced\n", + "# in the intervening time\n", + "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f9d3dfd", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch the rest of the artifacts\n", + "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n", + "preprocessing_pipeline_artifact_version = client.get_artifact_version(\"preprocess_pipeline\")" + ] + }, + { + "cell_type": "markdown", + "id": "7a7d1b04", + "metadata": {}, + "source": [ + "If you started with a fresh install, then you would have two versions corresponding\n", + "to the two pipelines that we ran above. We can even load an artifact version in memory: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c82aca75", + "metadata": {}, + "outputs": [], + "source": [ + "# Load an artifact to verify you can fetch it\n", + "dataset_trn_artifact_version.load()" + ] + }, + { + "cell_type": "markdown", + "id": "5963509e", + "metadata": {}, + "source": [ + "We'll use these artifacts from above in our next pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "8c28b474", + "metadata": {}, + "source": [ + "# โŒš Step 2: Training pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "87909827", + "metadata": {}, + "source": [ + "Now that we have our data it makes sense to train some models to get a sense of\n", + "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n", + "that it's unlikely we'll be able to train a model that behaves perfectly, since the problem \n", + "is inherently difficult, but we can get a sense of what a reasonable baseline looks like.\n", + "\n", + "We'll start with two simple models, an SGD Classifier from `sklearn` and an\n", + "XGBoost Classifier. We'll train them both on the\n", + "same data and then compare their performance.\n", + "\n", + "\"Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fccf1bd9", + "metadata": {}, + "outputs": [], + "source": [ + "# let's have a look at the model trainer step\n", + "%pycat steps/model_trainer.py" + ] + }, + { + "cell_type": "markdown", + "id": "73a00008", + "metadata": {}, + "source": [ + "Our two training steps both return different kinds of classifier\n", + "models, so we use the generic `ClassifierMixin` type hint for the return type."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5f22174",
+   "metadata": {},
+   "source": [
+    "ZenML allows you to load any version of any dataset that is tracked by the framework\n",
+    "directly into a pipeline using the `Client().get_artifact_version` interface. This is very convenient\n",
+    "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n",
+    "into the training pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1aa98f2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's have a look at the training pipeline\n",
+    "%pycat pipelines/training.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88b70fd3",
+   "metadata": {},
+   "source": [
+    "The end goal of this quick baseline evaluation is to understand which of the two\n",
+    "models performs better. We'll use the `evaluator` step to compare the two\n",
+    "models. This step takes in the model from the trainer step, and computes its score\n",
+    "over the testing set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c64885ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use an XGBoost model with the chosen datasets.\n",
+    "# We need to pass the IDs of the datasets into the function\n",
+    "training.with_options(enable_cache=False)(\n",
+    "    model_type=\"xgboost\",\n",
+    "    train_dataset_id=dataset_trn_artifact_version.id,\n",
+    "    test_dataset_id=dataset_tst_artifact_version.id\n",
+    ")\n",
+    "\n",
+    "xgboost_run = client.get_pipeline(\"training\").last_run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4300c82f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use an SGD classifier\n",
+    "sgd_run = training.with_options(enable_cache=False)(\n",
+    "    model_type=\"sgd\",\n",
+    "    train_dataset_id=dataset_trn_artifact_version.id,\n",
+    "    test_dataset_id=dataset_tst_artifact_version.id\n",
+    ")\n",
+    "\n",
+    "sgd_run = client.get_pipeline(\"training\").last_run"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43f1a68a",
+   "metadata": {},
+   "source": [
+    "You can see from the logs already how our model training went: the\n",
+    "`XGBClassifier` performed considerably better than the `SGDClassifier`.\n",
+    "We can use the ZenML `Client` to verify this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d95810b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The evaluator returns a float value with the accuracy\n",
+    "xgboost_run.steps[\"model_evaluator\"].output.load() >= sgd_run.steps[\"model_evaluator\"].output.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e256d145",
+   "metadata": {},
+   "source": [
+    "# ๐Ÿ’ฏ Step 3: Associating a model with your pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "927978f3",
+   "metadata": {},
+   "source": [
+    "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n",
+    "all the models produced as you develop your experiments and use-cases. 
Luckily, ZenML offers a *Model Control Plane*,\n", + "which is a central register of all your ML models.\n", + "\n", + "You can easily create a ZenML Model and associate it with your pipelines using the `Model` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ca00c0", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {}\n", + "\n", + "# Let's add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model\"] = Model(\n", + "    name=\"breast_cancer_classifier\",\n", + "    license=\"Apache 2.0\",\n", + "    description=\"A breast cancer classifier\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e78a520", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the SGD model and tag the version name with \"sgd\"\n", + "pipeline_settings[\"model\"].tags = [\"breast_cancer\", \"classifier\", \"sgd\"]\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# We can now run this as usual\n", + "training_configured(\n", + "    model_type=\"sgd\",\n", + "    train_dataset_id=dataset_trn_artifact_version.id,\n", + "    test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b8e0002", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the XGBoost model and tag the version name with \"xgboost\"\n", + "pipeline_settings[\"model\"].tags = [\"breast_cancer\", \"classifier\", \"xgboost\"]\n", + "\n", + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "training_configured = training.with_options(**pipeline_settings)\n", + "\n", + "# Let's run it again to make sure we have two versions\n", + "training_configured(\n", + "    model_type=\"xgboost\",\n", + "    train_dataset_id=dataset_trn_artifact_version.id,\n", + "    test_dataset_id=dataset_tst_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "09597223", + "metadata": {}, + "source": [ + "This time, running both pipelines has created two associated **model versions**.\n", + "You can list your ZenML models and their versions as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb25913", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_model = client.get_model(\"breast_cancer_classifier\")\n", + "print(zenml_model)\n", + "\n", + "versions = zenml_model.versions\n", + "\n", + "print(f\"Model {zenml_model.name} has {len(versions)} versions\")\n", + "\n", + "versions[-2].version, versions[-1].version" + ] + }, + { + "cell_type": "markdown", + "id": "e82cfac2", + "metadata": {}, + "source": [ + "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n", + "pipelines to that model version, including the two pickle files that represent our\n", + "SGD and XGBoost classifiers. 
We can see all artifacts directly from the model\n", + "version object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31211413", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load the XGBoost version\n", + "xgboost_zenml_model_version = client.list_model_versions(\"breast_cancer_classifier\", tag=\"xgboost\")[-1]\n", + "\n", + "# We can now load our classifier directly as well\n", + "xgboost_classifier = xgboost_zenml_model_version.get_artifact(\"breast_cancer_classifier\").load()\n", + "\n", + "xgboost_classifier" + ] + }, + { + "cell_type": "markdown", + "id": "53517a9a", + "metadata": {}, + "source": [ + "If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "eb645dde", + "metadata": {}, + "source": [ + "There is a lot more you can do with ZenML models, including the ability to\n", + "track metrics by adding metadata to them, or having them persist in a model\n", + "registry. However, these topics can be explored more in the\n", + "[ZenML docs](https://docs.zenml.io).\n", + "\n", + "For now, we will use the ZenML model control plane to promote our best\n", + "model to `production`. You can do this by simply setting the `stage` of\n", + "your chosen model version to `production`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26b718f8", + "metadata": {}, + "outputs": [], + "source": [ + "# Set our best classifier to production\n", + "xgboost_zenml_model_version.set_stage(\"production\", force=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9fddf3d0", + "metadata": {}, + "source": [ + "Of course, normally one would only promote the model by comparing it to all other model\n", + "versions and running some further tests. But that's a more advanced use-case. See the\n", + "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n", + "more insight into that sort of flow!" + ] + }, + { + "cell_type": "markdown", + "id": "2ecbc8cf", + "metadata": {}, + "source": [ + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "8f1146db", + "metadata": {}, + "source": [ + "Once the model is promoted, we can now consume the right model version in our\n", + "batch inference pipeline directly. Let's see how that works." + ] + }, + { + "cell_type": "markdown", + "id": "d6306f14", + "metadata": {}, + "source": [ + "# ๐Ÿซ… Step 4: Consuming the model in production" + ] + }, + { + "cell_type": "markdown", + "id": "b51f3108", + "metadata": {}, + "source": [ + "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n", + "with `live data`. 
The critical step here is the `inference_predict` step, where we load the model in memory\n", + "and generate predictions:\n", + "\n", + "\"Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92c4c7dc", + "metadata": {}, + "outputs": [], + "source": [ + "@step\n", + "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n", + "    \"\"\"Predictions step\"\"\"\n", + "    # Get the model\n", + "    model = get_step_context().model\n", + "\n", + "    # run prediction from memory\n", + "    predictor = model.load_artifact(\"breast_cancer_classifier\")\n", + "    predictions = predictor.predict(dataset_inf)\n", + "\n", + "    predictions = pd.Series(predictions, name=\"predicted\")\n", + "\n", + "    return predictions\n" + ] + }, + { + "cell_type": "markdown", + "id": "3aeb227b", + "metadata": {}, + "source": [ + "Apart from loading the model, we must also load the preprocessing pipeline that we ran in feature engineering,\n", + "so that we can apply the exact same steps at inference time that we did at training time. Let's bring it all together:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37c409bd", + "metadata": {}, + "outputs": [], + "source": [ + "@pipeline\n", + "def inference(preprocess_pipeline_id: UUID):\n", + "    \"\"\"Model batch inference pipeline\"\"\"\n", + "    # random_state = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).metadata[\"random_state\"].value\n", + "    # target = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).run_metadata['target'].value\n", + "    random_state = 42\n", + "    target = \"target\"\n", + "\n", + "    df_inference = data_loader(\n", + "        random_state=random_state, is_inference=True\n", + "    )\n", + "    df_inference = inference_preprocessor(\n", + "        dataset_inf=df_inference,\n", + "        # We use the preprocess pipeline from the feature engineering pipeline\n", + "        preprocess_pipeline=client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id),\n", + "        target=target,\n", + "    )\n", + "    inference_predict(\n", + "        dataset_inf=df_inference,\n", + "    )\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7afe7be", + "metadata": {}, + "source": [ + "The way to load the right model is to pass the `production` stage into the `Model` config this time.\n", + "This ensures that we always load the production model, decoupled from all other pipelines:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bf5939", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {\"enable_cache\": False}\n", + "\n", + "# Let's add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model\"] = Model(\n", + "    name=\"breast_cancer_classifier\",\n", + "    version=\"production\", # We can pass in the stage name here!\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff3402f1", + "metadata": {}, + "outputs": [], + "source": [ + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "inference_configured = inference.with_options(**pipeline_settings)\n", + "\n", + "# Let's run the inference pipeline\n", + "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n", + "# in order to avoid training-serving skew\n", + "inference_configured(\n", + "    preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2935d1fa",
"metadata": {}, + "source": [ + "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n", + "that were returned in the pipeline. This completes the MLOps loop of training to inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e191d019", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch production model\n", + "production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n", + "\n", + "# Get the predictions artifact\n", + "production_model_version.get_artifact(\"predictions\").load()" + ] + }, + { + "cell_type": "markdown", + "id": "b0a73cdf", + "metadata": {}, + "source": [ + "You can also see all predictions ever created as a complete history in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "f9525f15", + "metadata": {}, + "source": [ + "# ๐Ÿ™ Step 5: Analyzing results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d127837", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.client import Client\n", + "client = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2728fa4d", + "metadata": {}, + "outputs": [], + "source": [ + "sgd_model_version = client.list_model_versions(\"breast_cancer_classifier\",tag=\"sgd\")[-1]\n", + "xgboost_model_version = client.list_model_versions(\"breast_cancer_classifier\",tag=\"xgboost\")[-1]\n", + "print(f\"SGD version is staged as `{sgd_model_version.stage}`\")\n", + "print(f\"XGBoost version is staged as `{xgboost_model_version.stage}`\")" + ] + }, + { + "cell_type": "markdown", + "id": "fc9574f9", + "metadata": {}, + "source": [ + "At first, let's pull some meta information collected during models evaluation stage. 
As a reminder, we used this step as the evaluator:\n", + "```python\n", + "@step\n", + "def model_evaluator(\n", + "    model: ClassifierMixin,\n", + "    dataset_trn: pd.DataFrame,\n", + "    dataset_tst: pd.DataFrame,\n", + "    min_train_accuracy: float = 0.0,\n", + "    min_test_accuracy: float = 0.0,\n", + "    target: Optional[str] = \"target\",\n", + ") -> float:\n", + "    # Calculate the model accuracy on the train and test set\n", + "    trn_acc = model.score(...)\n", + "    tst_acc = model.score(...)\n", + "\n", + "    ...\n", + "    \n", + "    predictions = model.predict(dataset_tst.drop(columns=[target]))\n", + "    metadata = {\n", + "        \"train_accuracy\": float(trn_acc),\n", + "        \"test_accuracy\": float(tst_acc),\n", + "        \"confusion_matrix\": confusion_matrix(dataset_tst[target], predictions)\n", + "        .ravel()\n", + "        .tolist(),\n", + "    }\n", + "    log_model_metadata(metadata={\"wandb_url\": wandb.run.url})\n", + "    log_artifact_metadata(\n", + "        metadata=metadata,\n", + "        artifact_name=\"breast_cancer_classifier\",\n", + "    )\n", + "\n", + "    wandb.log({\"train_accuracy\": metadata[\"train_accuracy\"]})\n", + "    wandb.log({\"test_accuracy\": metadata[\"test_accuracy\"]})\n", + "    wandb.log(\n", + "        {\n", + "            \"confusion_matrix\": wandb.sklearn.plot_confusion_matrix(\n", + "                dataset_tst[target], predictions, [\"No Cancer\", \"Cancer\"]\n", + "            )\n", + "        }\n", + "    )\n", + "    return float(tst_acc)\n", + "```\n", + "First we pull the accuracy metrics out of both model versions for comparison:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795e5b2d", + "metadata": {}, + "outputs": [], + "source": [ + "sgd_clf_metadata = sgd_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n", + "xgboost_clf_metadata = xgboost_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n", + "print(f\"SGD{' (production)' if sgd_model_version.stage == 'production' else ''} metrics: train={sgd_clf_metadata['train_accuracy'].value*100:.2f}% test={sgd_clf_metadata['test_accuracy'].value*100:.2f}%\")\n", + "print(f\"XGBoost{' (production)' if xgboost_model_version.stage == 'production' else ''} metrics: train={xgboost_clf_metadata['train_accuracy'].value*100:.2f}% test={xgboost_clf_metadata['test_accuracy'].value*100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d5f9c87", + "metadata": {}, + "source": [ + "Now let's plot the collected confusion matrices:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4c1e8a", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_confusion_matrix(metadata_pointer, tp: str, ax):\n", + "    confusion_matrix = np.array(metadata_pointer[\"confusion_matrix\"].value, dtype=float).reshape((2, 2))\n", + "    confusion_matrix /= np.sum(confusion_matrix)\n", + "    sns.heatmap(confusion_matrix, annot=True, fmt='.2%', cmap=\"coolwarm\", ax=ax)\n", + "    ax.set_title(f\"{tp} confusion matrix\")\n", + "    ax.set_ylabel(\"Ground Label\")\n", + "    ax.set_xlabel(\"Predicted Label\")\n", + "\n", + "fig, ax = plt.subplots(1, 2, figsize=(15, 4))\n", + "plot_confusion_matrix(sgd_clf_metadata, \"SGD\", ax[0])\n", + "plot_confusion_matrix(xgboost_clf_metadata, \"XGBoost\", ax[1])" + ] + }, + { + "cell_type": "markdown", + "id": "5d37260e", + "metadata": {}, + "source": [ + "So far we were able to collect all the information we tracked using the Model Control Plane, but we also had Weights&Biases tracking enabled - let's dive in.\n", + "\n", + "Thanks to the Model Control Plane metadata 
we can establish a direct connection between these two entities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9288deaf", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'SGD version: {sgd_model_version.run_metadata[\"wandb_url\"].value}')\n", + "print(f'XGBoost version: {xgboost_model_version.run_metadata[\"wandb_url\"].value}')" + ] + }, + { + "cell_type": "markdown", + "id": "9743aff9", + "metadata": {}, + "source": [ + "With the Model Control Plane we can also easily track the lineage of artifacts and pipeline runs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4904337b", + "metadata": {}, + "outputs": [], + "source": [ + "for artifact_name, versions in sgd_model_version.data_artifacts.items():\n", + "    if versions:\n", + "        print(f\"Existing versions of `{artifact_name}`:\")\n", + "        for version_name, artifact_ in versions.items():\n", + "            print(version_name, artifact_.data_type.attribute)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1b036db", + "metadata": {}, + "outputs": [], + "source": [ + "for run_name, run_ in sgd_model_version.pipeline_runs.items():\n", + "    print(run_name, run_.id)" + ] + }, + { + "cell_type": "markdown", + "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", + "metadata": {}, + "source": [ + "# ๐Ÿ™ Step 6: Moving to production\n", + "\n", + "Let's run all the moving pieces we navigated in the previous steps using the production-ready Python script `run.py`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack set local-wandb\n", + "!zenml stack describe local-wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python3 run.py --training-pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the full run has executed on the local stack, and the experiment is tracked using the Model Control Plane and Weights&Biases.\n", + "\n", + "Let's move some heavy lifting to SageMaker. This can be achieved using the SageMaker orchestrator.\n", + "\n", + "\"Sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack set sagemaker-pipelines-wandb\n", + "!zenml stack describe sagemaker-pipelines-wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python3 run.py --training-pipeline" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/classifier-e2e/run_skip_basics.ipynb b/classifier-e2e/run_skip_basics.ipynb new file mode 100644 index 00000000..7426d91e --- /dev/null +++ b/classifier-e2e/run_skip_basics.ipynb @@ -0,0 +1,1219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63ab391a", + "metadata": {}, + "source": [ + "# ๐ŸŒ Overview\n", + "\n", + "This demo is a minimalistic MLOps project intended to showcase how to put ML workflows in production. 
It features: \n", + "\n", + "- A feature engineering pipeline that loads data and prepares it for training.\n", + "- A training pipeline that loads the preprocessed dataset and trains a model.\n", + "- A batch inference pipeline that runs predictions on the trained model with new data.\n", + "- Stack switching and use of the Sagemaker step operator to offload training to the cloud.\n", + "- An analysis of training artifacts and their lineage (including the connection to W&B).\n", + "\n", + "\"Pipelines" + ] + }, + { + "cell_type": "markdown", + "id": "66b2977c", + "metadata": {}, + "source": [ + "# 👶 Step 0. Install Requirements\n", + "\n", + "Let's install ZenML to get started. First we'll install the latest version of\n", + "ZenML as well as the `sklearn` and `xgboost` integrations of ZenML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f76f562e", + "metadata": {}, + "outputs": [], + "source": [ + "! pip3 install -r requirements.txt\n", + "! zenml integration install sklearn xgboost -y\n", + "! zenml connect --url https://1cf18d95-zenml.cloudinfra.zenml.io \n", + "\n", + "import IPython\n", + "IPython.Application.instance().kernel.do_shutdown(restart=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "081d5616", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize ZenML and set the default stack\n", + "!zenml init\n", + "!zenml stack set local-sagemaker-step-operator-wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f775f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Do the imports at the top\n", + "from zenml import Model\n", + "from zenml.client import Client\n", + "from zenml.logger import get_logger\n", + "\n", + "from pipelines import training, inference\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "# Initialize the ZenML client to fetch objects from the ZenML Server\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "8c28b474", + "metadata": {}, + "source": [ + "# ⌚ Step 1: Training pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "87909827", + "metadata": {}, + "source": [ + "Now that we have our data, it makes sense to train some models to get a sense of\n", + "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n", + "that it's unlikely we'll be able to train a model that behaves perfectly,\n", + "but we can get a sense of what a reasonable baseline looks like.\n", + "\n", + "We'll start with two simple models, an SGD classifier (batteries-included from\n", + "`sklearn`) and an XGBoost classifier. We'll train them both on the\n", + "same data and then compare their performance.\n", + "\n", + "\"Training" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dc08aecc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31m# Apache Software License 2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Copyright (c) ZenML GmbH 2024. 
All rights reserved.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# you may not use this file except in compliance with the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# You may obtain a copy of the License at\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# http://www.apache.org/licenses/LICENSE-2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Unless required by applicable law or agreed to in writing, software\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# distributed under the License is distributed on an \"AS IS\" BASIS,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# See the License for the specific language governing permissions and\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# limitations under the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mtyping\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mClassifierMixin\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_model\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSGDClassifier\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mtyping_extensions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mArtifactConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_materializer\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSagemakerMaterializer\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + 
"\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_materializers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSagemakerMaterializer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mmodel_trainer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_type\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"sgd\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"target\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mClassifierMixin\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mArtifactConfig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"breast_cancer_classifier\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_model_artifact\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"Configure and train a model on the training dataset.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m This is an example of a model training step that takes in a dataset artifact\u001b[0m\n", + "\u001b[0;34m previously loaded and pre-processed by other steps in your pipeline, then\u001b[0m\n", + "\u001b[0;34m configures and trains a model on it. 
The model is then returned as a step\u001b[0m\n", + "\u001b[0;34m output artifact.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Args:\u001b[0m\n", + "\u001b[0;34m dataset_trn: The preprocessed train dataset.\u001b[0m\n", + "\u001b[0;34m model_type: The type of model to train.\u001b[0m\n", + "\u001b[0;34m target: The name of the target column in the dataset.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Returns:\u001b[0m\n", + "\u001b[0;34m The trained model artifact.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Raises:\u001b[0m\n", + "\u001b[0;34m ValueError: If the model type is not supported.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Initialize the model with the hyperparameters indicated in the step\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# parameters and train it on the training set.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmodel_type\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"sgd\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSGDClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmodel_type\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"xgboost\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mxgboost\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mXGBClassifier\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mXGBClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Unknown model type {model_type}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Training model {model}...\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "# let's have a look at model training step\n", + "%pycat steps/model_trainer.py" + ] + }, + { + "cell_type": "markdown", + "id": "73a00008", + "metadata": {}, + "source": [ + "Our two training steps both return different kinds of classifier\n", + 
"models, so we use the generic `ClassifierMixin` type hint for the return type." + ] + }, + { + "cell_type": "markdown", + "id": "a5f22174", + "metadata": {}, + "source": [ + "ZenML allows you to load any version of any dataset that is tracked by the framework\n", + "directly into a pipeline using the `Client().get_artifact_version` interface. This is very convenient\n", + "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n", + "into the training pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "01162d23", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31m# Apache Software License 2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Copyright (c) ZenML GmbH 2024. All rights reserved.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# you may not use this file except in compliance with the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# You may obtain a copy of the License at\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# http://www.apache.org/licenses/LICENSE-2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Unless required by applicable law or agreed to in writing, software\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# distributed under the License is distributed on an \"AS IS\" BASIS,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# See the License for the specific language governing permissions and\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# limitations under the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mtyping\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0muuid\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mUUID\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msteps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmodel_evaluator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_promoter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_trainer\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mpipelines\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mfeature_engineering\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m 
\u001b[0;32mimport\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mClient\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mtraining\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtrain_dataset_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUUID\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtest_dataset_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUUID\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"target\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_type\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"sgd\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Model training pipeline.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m This is a pipeline that loads the data from a preprocessing pipeline,\u001b[0m\n", + "\u001b[0;34m trains a model on it and evaluates the model. If it is the first model\u001b[0m\n", + "\u001b[0;34m to be trained, it will be promoted to production. 
If not, it will be\u001b[0m\n", + "\u001b[0;34m promoted only if it has a higher accuracy than the current production\u001b[0m\n", + "\u001b[0;34m model version.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Args:\u001b[0m\n", + "\u001b[0;34m train_dataset_id: ID of the train dataset produced by feature engineering.\u001b[0m\n", + "\u001b[0;34m test_dataset_id: ID of the test dataset produced by feature engineering.\u001b[0m\n", + "\u001b[0;34m target: Name of target column in dataset.\u001b[0m\n", + "\u001b[0;34m model_type: The type of model to train.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Link all the steps together by calling them and passing the output\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# of one step as the input of the next step.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Execute Feature Engineering Pipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtrain_dataset_id\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mtest_dataset_id\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataset_tst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfeature_engineering\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artifact_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mname_id_or_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_dataset_id\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_tst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artifact_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mname_id_or_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_dataset_id\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_trainer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdataset_trn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel_type\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0macc\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mmodel_evaluator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_trn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdataset_trn\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_tst\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdataset_tst\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_promoter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccuracy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0macc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "# let's have a look at the training pipeline\n", + "%pycat pipelines/training.py" + ] + }, + { + "cell_type": "markdown", + "id": "88b70fd3", + "metadata": {}, + "source": [ + "The end goal of this quick baseline evaluation is to understand which of the two\n", + "models performs better. We'll use the `evaluator` step to compare the two\n", + "models. This step takes in the model from the trainer step, and computes its score\n", + "over the testing set.\n", + "\n", + "Soon you will see that it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n", + "all the models produced as you develop your experiments and use cases. Luckily, ZenML offers a *Model Control Plane*,\n", + "which is a central register of all your ML models.\n", + "\n", + "You can easily create a ZenML Model and associate it with your pipelines using the `Model` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19fe53b2", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {}\n", + "\n", + "# Let's add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model\"] = Model(\n", + " name=\"breast_cancer_classifier\",\n", + " license=\"Apache 2.0\",\n", + " description=\"A breast cancer classifier\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c64885ac", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the XGBoost model and tag the version name with \"xgboost\"\n", + "pipeline_settings[\"model\"].tags = [\"breast_cancer\", \"classifier\", \"xgboost\"]\n", + "\n", + "# Use an XGBoost model with fixed seed.\n", + "training.with_options(enable_cache=False, **pipeline_settings)(\n", + " model_type=\"xgboost\",\n", + " random_state=42\n", + ")\n", + "\n", + "xgboost_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4300c82f", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's train the SGD model and tag the version name with \"sgd\"\n", + "pipeline_settings[\"model\"].tags = [\"breast_cancer\", \"classifier\", \"sgd\"]\n", + "\n", + "# Use an SGD classifier with fixed seed.\n", + "training.with_options(enable_cache=False, **pipeline_settings)(\n", + " model_type=\"sgd\",\n", + " random_state=42\n", + ")\n", + "\n", + "sgd_run = client.get_pipeline(\"training\").last_run" + ] + }, + { + "cell_type": "markdown", + "id": "43f1a68a", + "metadata": {}, + "source": [ + 
"You can see from the logs already how our model training went: the\n", + "`XGBClassifier` performed considerably better than the `SGDClassifier`.\n", + "We can use the ZenML `Client` to verify this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d95810b1", + "metadata": {}, + "outputs": [], + "source": [ + "# The evaluator returns a float value with the accuracy\n", + "xgboost_run.steps[\"model_evaluator\"].output.load() >= sgd_run.steps[\"model_evaluator\"].output.load()" + ] + }, + { + "cell_type": "markdown", + "id": "09597223", + "metadata": {}, + "source": [ + "Running both pipelines has created two associated **model versions**.\n", + "You can list your ZenML model and their versions as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb25913", + "metadata": {}, + "outputs": [], + "source": [ + "zenml_model = client.get_model(\"breast_cancer_classifier\")\n", + "print(zenml_model)\n", + "\n", + "versions = zenml_model.versions\n", + "\n", + "print(f\"Model {zenml_model.name} has {len(versions)} versions\")\n", + "\n", + "versions[-2].version, versions[-1].version" + ] + }, + { + "cell_type": "markdown", + "id": "e82cfac2", + "metadata": {}, + "source": [ + "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n", + "pipelines to that model version, including the two pickle files that represent our\n", + "SGD and RandomForest classifier. We can see all artifacts directly from the model\n", + "version object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31211413", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load the XGBoost version\n", + "xgboost_zenml_model_version = client.list_model_versions(\"breast_cancer_classifier\", tag=\"xgboost\")[-1]\n", + "\n", + "# We can now load our classifier directly as well\n", + "xgboost_classifier = xgboost_zenml_model_version.get_artifact(\"breast_cancer_classifier\").load()\n", + "\n", + "xgboost_classifier" + ] + }, + { + "cell_type": "markdown", + "id": "53517a9a", + "metadata": {}, + "source": [ + "If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "eb645dde", + "metadata": {}, + "source": [ + "There is a lot more you can do with ZenML models, including the ability to\n", + "track metrics by adding metadata to it, or having them persist in a model\n", + "registry. However, these topics can be explored more in the\n", + "[ZenML docs](https://docs.zenml.io).\n", + "\n", + "For now, we will use the ZenML model control plane to promote our best\n", + "model to `production`. You can do this by simply setting the `stage` of\n", + "your chosen model version to the `production` tag." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26b718f8", + "metadata": {}, + "outputs": [], + "source": [ + "# Set our best classifier to production\n", + "xgboost_zenml_model_version.set_stage(\"production\", force=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9fddf3d0", + "metadata": {}, + "source": [ + "Of course, normally one would only promote the model by comparing to all other model\n", + "versions and doing some other tests. But that's a bit more advanced use-case. See the\n", + "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n", + "more insight into that sort of flow!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "2ecbc8cf", + "metadata": {}, + "source": [ + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "8f1146db", + "metadata": {}, + "source": [ + "Once the model is promoted, we can now consume the right model version in our\n", + "batch inference pipeline directly. Let's see how that works." + ] + }, + { + "cell_type": "markdown", + "id": "d6306f14", + "metadata": {}, + "source": [ + "# ๐Ÿซ… Step 2: Consuming the model in production" + ] + }, + { + "cell_type": "markdown", + "id": "b51f3108", + "metadata": {}, + "source": [ + "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n", + "with `live data`. The critical step here is the `inference_predict` step, where we load the model in memory\n", + "and generate predictions:\n", + "\n", + "\"Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "92c4c7dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31m# Apache Software License 2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Copyright (c) ZenML GmbH 2023. All rights reserved.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# you may not use this file except in compliance with the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# You may obtain a copy of the License at\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# http://www.apache.org/licenses/LICENSE-2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Unless required by applicable law or agreed to in writing, software\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# distributed under the License is distributed on an \"AS IS\" BASIS,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# See the License for the specific language governing permissions and\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# limitations under the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mtyping\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mtyping_extensions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m\u001b[0m\n", + 
"\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0minference_predict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mAny\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_inf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"predictions\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"Predictions step.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m This is an example of a predictions step that takes the data and model in\u001b[0m\n", + "\u001b[0;34m and returns predicted values.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m This step is parameterized, which allows you to configure the step\u001b[0m\n", + "\u001b[0;34m independently of the step code, before running it in a pipeline.\u001b[0m\n", + "\u001b[0;34m In this example, the step can be configured to use different input data.\u001b[0m\n", + "\u001b[0;34m See the documentation for more information:\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Args:\u001b[0m\n", + "\u001b[0;34m model: Trained model.\u001b[0m\n", + "\u001b[0;34m dataset_inf: The inference dataset.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Returns:\u001b[0m\n", + "\u001b[0;34m The predictions as pandas series\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# run prediction from memory\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_inf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"predicted\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "# let's have a look at training pipeline\n", + "%pycat 
steps/inference_predict.py" + ] + }, + { + "cell_type": "markdown", + "id": "3aeb227b", + "metadata": {}, + "source": [ + "Apart from loading the model, we must also load the preprocessing pipeline that we ran during feature engineering,\n", + "so that at inference time we can apply exactly the same steps that were applied at training time. Let's bring it all together:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "37c409bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31m# Apache Software License 2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Copyright (c) ZenML GmbH 2024. All rights reserved.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# you may not use this file except in compliance with the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# You may obtain a copy of the License at\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# http://www.apache.org/licenses/LICENSE-2.0\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# Unless required by applicable law or agreed to in writing, software\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# distributed under the License is distributed on an \"AS IS\" BASIS,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# See the License for the specific language governing permissions and\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m# limitations under the License.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msteps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minference_predict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minference_preprocessor\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_pipeline_context\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_logger\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0mlogger\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mget_logger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0minference\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"\"\"\u001b[0m\n", + "\u001b[0;34m Model inference pipeline.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m This is a pipeline that loads the inference data, processes it with\u001b[0m\n", + "\u001b[0;34m the same preprocessing pipeline used in training, and runs inference\u001b[0m\n", + "\u001b[0;34m with the trained model.\u001b[0m\n", + "\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m Args:\u001b[0m\n", + "\u001b[0;34m random_state: Random state for reproducibility.\u001b[0m\n", + "\u001b[0;34m target: Name of target column in dataset.\u001b[0m\n", + "\u001b[0;34m \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Get the production model artifact\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pipeline_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artifact\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"breast_cancer_classifier\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Get the preprocess pipeline artifact associated with this version\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpreprocess_pipeline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pipeline_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artifact\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"preprocess_pipeline\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Link all the steps together by calling them and passing the output\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# of one step as the input of the next step.\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf_inference\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_inference\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf_inference\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minference_preprocessor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_inf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_inference\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0mpreprocess_pipeline\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpreprocess_pipeline\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minference_predict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_inf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_inference\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "# let's have a look at the inference pipeline\n", + "%pycat pipelines/inference.py" + ] + }, + { + "cell_type": "markdown", + "id": "c7afe7be", + "metadata": {}, + "source": [ + "The way to load the right model is to pass the `production` stage into the `Model` config this time.\n", + "This ensures we always load the production model version, decoupled from all other pipelines:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bf5939", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_settings = {\"enable_cache\": False}\n", + "\n", + "# Let's add some metadata to the model to make it identifiable\n", + "pipeline_settings[\"model\"] = Model(\n", + " name=\"breast_cancer_classifier\",\n", + " version=\"production\", # We can pass in the stage name here!\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff3402f1", + "metadata": {}, + "outputs": [], + "source": [ + "# the `with_options` method allows us to pass in pipeline settings\n", + "# and returns a configured pipeline\n", + "inference.with_options(**pipeline_settings)()" + ] + }, + { + "cell_type": "markdown", + "id": "2935d1fa", + "metadata": {}, + "source": [ + "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n", + "that were returned in the pipeline. 
This completes the MLOps loop of training to inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e191d019", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch production model\n", + "production_model_version = client.get_model_version(\"breast_cancer_classifier\", \"production\")\n", + "\n", + "# Get the predictions artifact\n", + "production_model_version.get_artifact(\"predictions\").load()" + ] + }, + { + "cell_type": "markdown", + "id": "b0a73cdf", + "metadata": {}, + "source": [ + "You can also see all predictions ever created as a complete history in the dashboard:\n", + "\n", + "\"Model" + ] + }, + { + "cell_type": "markdown", + "id": "cdf22981", + "metadata": {}, + "source": [ + "# 🙏 Step 3: Analyzing results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2728fa4d", + "metadata": {}, + "outputs": [], + "source": [ + "sgd_model_version = client.list_model_versions(\"breast_cancer_classifier\", tag=\"sgd\")[-1]\n", + "xgboost_model_version = client.list_model_versions(\"breast_cancer_classifier\", tag=\"xgboost\")[-1]\n", + "print(f\"SGD version is staged as `{sgd_model_version.stage}`\")\n", + "print(f\"XGBoost version is staged as `{xgboost_model_version.stage}`\")" + ] + }, + { + "cell_type": "markdown", + "id": "f5d36b23", + "metadata": {}, + "source": [ + "First, let's pull some metadata collected during the model evaluation stage. To recap, we used this step as the evaluator:\n", + "```python\n", + "@step\n", + "def model_evaluator(\n", + " model: ClassifierMixin,\n", + " dataset_trn: pd.DataFrame,\n", + " dataset_tst: pd.DataFrame,\n", + " min_train_accuracy: float = 0.0,\n", + " min_test_accuracy: float = 0.0,\n", + " target: Optional[str] = \"target\",\n", + ") -> float:\n", + " # Calculate the model accuracy on the train and test set\n", + " trn_acc = model.score(...)\n", + " tst_acc = model.score(...)\n", + "\n", + " ...\n", + " \n", + " predictions = model.predict(dataset_tst.drop(columns=[target]))\n", + " metadata = {\n", + " \"train_accuracy\": float(trn_acc),\n", + " \"test_accuracy\": float(tst_acc),\n", + " \"confusion_matrix\": confusion_matrix(dataset_tst[target], predictions)\n", + " .ravel()\n", + " .tolist(),\n", + " }\n", + " log_model_metadata(metadata={\"wandb_url\": wandb.run.url})\n", + " log_artifact_metadata(\n", + " metadata=metadata,\n", + " artifact_name=\"breast_cancer_classifier\",\n", + " )\n", + "\n", + " wandb.log({\"train_accuracy\": metadata[\"train_accuracy\"]})\n", + " wandb.log({\"test_accuracy\": metadata[\"test_accuracy\"]})\n", + " wandb.log(\n", + " {\n", + " \"confusion_matrix\": wandb.sklearn.plot_confusion_matrix(\n", + " dataset_tst[target], predictions, [\"No Cancer\", \"Cancer\"]\n", + " )\n", + " }\n", + " )\n", + " return float(tst_acc)\n", + "```\n", + "First, we pull the accuracy metrics out of both model versions for comparison:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce698de6", + "metadata": {}, + "outputs": [], + "source": [ + "sgd_clf_metadata = sgd_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n", + "xgboost_clf_metadata = xgboost_model_version.get_artifact(\"breast_cancer_classifier\").run_metadata\n", + "print(f\"SGD{' (production)' if sgd_model_version.stage == 'production' else ''} metrics: train={sgd_clf_metadata['train_accuracy'].value*100:.2f}% test={sgd_clf_metadata['test_accuracy'].value*100:.2f}%\")\n", + "print(f\"XGBoost{' (production)' if xgboost_model_version.stage == 'production' else ''} metrics: train={xgboost_clf_metadata['train_accuracy'].value*100:.2f}% test={xgboost_clf_metadata['test_accuracy'].value*100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "1ceb5256", + "metadata": {}, + "source": [ + "Now let's plot the collected confusion matrices:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d488910e", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_confusion_matrix(metadata_pointer, tp: str, ax):\n", + " # Reshape the flattened confusion matrix and normalize it to percentages\n", + " confusion_matrix = np.array(metadata_pointer[\"confusion_matrix\"].value, dtype=float).reshape((2,2))\n", + " confusion_matrix /= np.sum(confusion_matrix)\n", + " sns.heatmap(confusion_matrix, annot=True, fmt='.2%', cmap=\"coolwarm\", ax=ax)\n", + " ax.set_title(f\"{tp} confusion matrix\")\n", + " ax.set_ylabel(\"Ground Truth Label\")\n", + " ax.set_xlabel(\"Predicted Label\")\n", + "\n", + "fig, ax = plt.subplots(1, 2, figsize=(15,4))\n", + "plot_confusion_matrix(sgd_clf_metadata, \"SGD\", ax[0])\n", + "plot_confusion_matrix(xgboost_clf_metadata, \"XGBoost\", ax[1])" + ] + }, + { + "cell_type": "markdown", + "id": "fcb63aee", + "metadata": {}, + "source": [ + "So far we have collected all the information we tracked using the Model Control Plane, but we also had Weights&Biases tracking enabled - let's dive in.\n", + "\n", + "Thanks to the Model Control Plane metadata we establish a convenient link between these two entities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7097d4f", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'SGD version: {sgd_model_version.run_metadata[\"wandb_url\"].value}')\n", + "print(f'XGBoost version: {xgboost_model_version.run_metadata[\"wandb_url\"].value}')" + ] + }, + { + "cell_type": "markdown", + "id": "5de78e7b", + "metadata": {}, + "source": [ + "With the Model Control Plane we can also easily track the lineage of artifacts and pipeline runs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8891b6", + "metadata": {}, + "outputs": [], + "source": [ + "for artifact_name, versions in sgd_model_version.data_artifacts.items():\n", + " if versions:\n", + " print(f\"Existing version of `{artifact_name}`:\")\n", + " for version_name, artifact_ in versions.items():\n", + " print(version_name, artifact_.data_type.attribute)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76d386dc", + "metadata": {}, + "outputs": [], + "source": [ + "for run_name, run_ in sgd_model_version.pipeline_runs.items():\n", + " print(run_name, run_.id)" + ] + }, + { + "cell_type": "markdown", + "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", + "metadata": {}, + "source": [ + "# 🙏 Step 4: Moving to production" + ] + }, + { + "cell_type": "markdown", + "id": "dfe22780", + "metadata": {}, + "source": [ + "Let's move some heavy lifting to Sagemaker. 
This can be achieved using the Sagemaker orchestrator.\n", + "\n", + "\"Sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2774cbc", + "metadata": {}, + "outputs": [], + "source": [ + "!zenml stack set sagemaker-pipelines-wandb\n", + "!zenml stack describe sagemaker-pipelines-wandb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3e37542", + "metadata": {}, + "outputs": [], + "source": [ + "client.activate_stack(\"sagemaker-pipelines-wandb\")\n", + "\n", + "pipeline_settings = {}\n", + "\n", + "pipeline_settings[\"model\"] = Model(\n", + " name=\"breast_cancer_classifier\",\n", + " license=\"Apache 2.0\",\n", + " description=\"A breast cancer classifier\",\n", + " tags=[\"breast_cancer\", \"classifier\", \"xgboost\"]\n", + ")\n", + "\n", + "training.with_options(config_path=\"configs/training_xgboost.yaml\", **pipeline_settings)(\n", + " model_type=\"xgboost\",\n", + " random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d950f97e", + "metadata": {}, + "source": [ + "# 🧙‍♂️ Step 5: Deploy Sagemaker endpoint\n", + "\n", + "After training a production-ready model version and promoting it to the `production` stage, we can proceed to deploy it as a Sagemaker endpoint.\n", + "The deployment pipeline looks as follows:\n", + "\n", + "\"Deployment\n", + "\n", + "First, we will explore how the deployment step is designed. Depending on the origin of the current `production` model, we need to adapt the deployment action a bit. For `XGBoost` we can use the standard Sagemaker image together with a model saved via XGBoost's standard `save_model` method. For `sklearn`, however, this approach will not work properly, since there is no standard way of saving sklearn models besides pickling them via various libraries, so we need an entry-point script that tells the endpoint how to load the model, run predictions, and so on."
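, + "\n", + "For illustration, a minimal sklearn entry-point script could follow the `model_fn`/`predict_fn` convention of the Sagemaker sklearn container (a hedged sketch; the actual `utils/sklearn_inference.py` in this repository may differ):\n", + "```python\n", + "# Minimal sketch of a Sagemaker sklearn inference script (an assumption, not the repo's exact code)\n", + "import os\n", + "\n", + "import joblib\n", + "\n", + "\n", + "def model_fn(model_dir):\n", + "    # Load the pickled sklearn model shipped inside model.tar.gz\n", + "    return joblib.load(os.path.join(model_dir, \"model.joblib\"))\n", + "\n", + "\n", + "def predict_fn(input_data, model):\n", + "    # Run predictions on the already-deserialized request payload\n", + "    return model.predict(input_data)\n", + "```"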
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "496adffa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;32mfrom\u001b[0m \u001b[0mtyping_extensions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mimport\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimage_uris\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mretrieve\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msagemaker\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPredictor\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mget_step_context\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mArtifactConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_artifact_metadata\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mdatetime\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maws\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mget_aws_config\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_materializer\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSagemakerPredictorMaterializer\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0menable_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0moutput_materializers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSagemakerPredictorMaterializer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mdeploy_endpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mAnnotated\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mPredictor\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mArtifactConfig\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"sagemaker_endpoint\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_deployment_artifact\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrole\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mregion\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mget_aws_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_step_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_model_version\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"sgd\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mt\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtags\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mimage_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mregion\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mregion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mframework\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"sklearn\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mversion\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"1.0-1\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mentry_point\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"utils/sklearn_inference.py\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mimage_uri\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mregion\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mregion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mframework\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"xgboost\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mversion\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"1.5-1\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mentry_point\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34mf'{model.get_artifact(\"breast_cancer_classifier\").uri}/model.tar.gz'\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mendpoint_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf'breast-cancer-classifier-{datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S-%f\")}'\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mimage_uri\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mimage_uri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mmodel_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel_data\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m 
\u001b[0msagemaker_session\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrole\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrole\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mentry_point\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mentry_point\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdeploy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minitial_instance_count\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minstance_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ml.m5.large\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mendpoint_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendpoint_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlog_artifact_metadata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"endpoint_name\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mendpoint_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"image_uri\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mimage_uri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"role_arn\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrole\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mPredictor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mendpoint_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendpoint_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "%pycat steps/deploy_endpoint.py" + ] + }, + { + "cell_type": "markdown", + "id": "3c60b684", + "metadata": {}, + "source": [ + "The full deployment pipeline code is the following.\n", + "\n", + "NOTE: we deprovision model at the end of the pipeline to save cost, but this step will not be there in production setting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9dfb5642", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;32mfrom\u001b[0m \u001b[0mzenml\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mget_pipeline_context\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msteps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minference_preprocessor\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mimport\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mfrom\u001b[0m \u001b[0msteps\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdeploy_endpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredict_on_endpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshutdown_endpoint\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m@\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mdeploy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshutdown_endpoint_after_predicting\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;31m# Get the preprocess pipeline artifact associated with this version\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpreprocess_pipeline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_pipeline_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artifact\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m\"preprocess_pipeline\"\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf_inference\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_inference\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdf_inference\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minference_preprocessor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_inf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf_inference\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpreprocess_pipeline\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpreprocess_pipeline\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"target\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredictor\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mdeploy_endpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredict_on_endpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf_inference\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshutdown_endpoint_after_predicting\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mshutdown_endpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mafter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"predict_on_endpoint\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "%pycat pipelines/deploy.py" + ] + }, + { + "cell_type": "markdown", + "id": "5ba94464", + "metadata": {}, + "source": [ + "Ok, now we can deploy our model and explore predictions!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aff4c929", + "metadata": {}, + "outputs": [], + "source": [ + "from pipelines import deploy\n", + "\n", + "client.activate_stack(\"local-sagemaker-step-operator-wandb\")\n", + "\n", + "deploy.with_options(\n", + " model=Model(name=\"breast_cancer_classifier\", version=\"production\")\n", + ")(shutdown_endpoint_after_predicting=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41cec4dc", + "metadata": {}, + "outputs": [], + "source": [ + "# explore created endpoint\n", + "run_metadata = client.get_model_version(\"breast_cancer_classifier\", \"production\").get_artifact(\"sagemaker_endpoint\").run_metadata\n", + "for k,v in run_metadata.items():\n", + " print(k, v.value)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9e5fc22", + "metadata": {}, + "outputs": [], + "source": [ + "# explore real time predictions\n", + "client.get_model_version(\"breast_cancer_classifier\", \"production\").get_artifact(\"real_time_predictions\").load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/stack-showcase/steps/__init__.py b/classifier-e2e/steps/__init__.py similarity index 81% rename from stack-showcase/steps/__init__.py rename to classifier-e2e/steps/__init__.py index 4db083c1..b17987b9 100644 --- a/stack-showcase/steps/__init__.py +++ b/classifier-e2e/steps/__init__.py @@ -24,6 +24,5 @@ from .model_promoter import ( model_promoter, ) -from .deploy_to_huggingface import ( - deploy_to_huggingface, -) \ No newline at end of file +from .deploy_endpoint import deploy_endpoint +from .misc_endpoint import predict_on_endpoint, shutdown_endpoint \ No newline at end of file diff --git a/stack-showcase/steps/data_loader.py b/classifier-e2e/steps/data_loader.py similarity index 67% rename from stack-showcase/steps/data_loader.py rename to classifier-e2e/steps/data_loader.py index cc6df580..1934baa8 100644 --- a/stack-showcase/steps/data_loader.py +++ b/classifier-e2e/steps/data_loader.py @@ -1,9 +1,25 @@ -# {% include 'template/license_header' %} +# Apache Software 
License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import pandas as pd from sklearn.datasets import load_breast_cancer from typing_extensions import Annotated -from zenml import log_artifact_metadata, step + +from zenml import step from zenml.logger import get_logger logger = get_logger(__name__) @@ -25,19 +41,20 @@ def data_loader( https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines Args: + random_state: Random state for sampling is_inference: If `True` subset will be returned and target column will be removed from dataset. - random_state: Random state for sampling target: Name of target columns in dataset. Returns: The dataset artifact as Pandas DataFrame and name of target column. """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### dataset = load_breast_cancer(as_frame=True) inference_size = int(len(dataset.target) * 0.05) dataset: pd.DataFrame = dataset.frame - inference_subset = dataset.sample(inference_size, random_state=random_state) + inference_subset = dataset.sample( + inference_size, random_state=random_state + ) if is_inference: dataset = inference_subset dataset.drop(columns=target, inplace=True) @@ -45,9 +62,4 @@ def data_loader( dataset.drop(inference_subset.index, inplace=True) dataset.reset_index(drop=True, inplace=True) logger.info(f"Dataset with {len(dataset)} records loaded!") - - # Recording metadata for this dataset - log_artifact_metadata(metadata={"random_state": random_state, target: target}) - - ### YOUR CODE ENDS HERE ### return dataset diff --git a/stack-showcase/steps/data_preprocessor.py b/classifier-e2e/steps/data_preprocessor.py similarity index 61% rename from stack-showcase/steps/data_preprocessor.py rename to classifier-e2e/steps/data_preprocessor.py index 961f1468..1bf99025 100644 --- a/stack-showcase/steps/data_preprocessor.py +++ b/classifier-e2e/steps/data_preprocessor.py @@ -1,54 +1,34 @@ -# {% include 'template/license_header' %} +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -from typing import Union -import pandas as pd from typing import List, Optional, Tuple import pandas as pd from sklearn.pipeline import Pipeline from sklearn.preprocessing import MinMaxScaler from typing_extensions import Annotated -from zenml import log_artifact_metadata, step - - -class NADropper: - """Support class to drop NA values in sklearn Pipeline.""" - - def fit(self, *args, **kwargs): - return self - - def transform(self, X: Union[pd.DataFrame, pd.Series]): - return X.dropna() - - -class ColumnsDropper: - """Support class to drop specific columns in sklearn Pipeline.""" - - def __init__(self, columns): - self.columns = columns - - def fit(self, *args, **kwargs): - return self +from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper - def transform(self, X: Union[pd.DataFrame, pd.Series]): - return X.drop(columns=self.columns) - - -class DataFrameCaster: - """Support class to cast type back to pd.DataFrame in sklearn Pipeline.""" - - def __init__(self, columns): - self.columns = columns - - def fit(self, *args, **kwargs): - return self - - def transform(self, X): - return pd.DataFrame(X, columns=self.columns) +from zenml import log_artifact_metadata, step @step def data_preprocessor( + random_state: int, dataset_trn: pd.DataFrame, dataset_tst: pd.DataFrame, drop_na: Optional[bool] = None, @@ -66,7 +46,7 @@ def data_preprocessor( it is suitable for model training. It takes in a dataset as an input step artifact and performs any necessary preprocessing steps like cleaning, feature engineering, feature selection, etc. It then returns the processed - dataset as an step output artifact. + dataset as a step output artifact. This step is parameterized, which allows you to configure the step independently of the step code, before running it in a pipeline. @@ -77,39 +57,38 @@ def data_preprocessor( https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines Args: + random_state: Random state for sampling. dataset_trn: The train dataset. dataset_tst: The test dataset. drop_na: If `True` all NA rows will be dropped. normalize: If `True` all numeric fields will be normalized. drop_columns: List of column names to drop. + target: Name of target column in dataset. Returns: The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. 
""" - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### # We use the sklearn pipeline to chain together multiple preprocessing steps preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) if drop_na: preprocess_pipeline.steps.append(("drop_na", NADropper())) if drop_columns: # Drop columns - preprocess_pipeline.steps.append(("drop_columns", ColumnsDropper(drop_columns))) + preprocess_pipeline.steps.append( + ("drop_columns", ColumnsDropper(drop_columns)) + ) if normalize: # Normalize the data preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) - preprocess_pipeline.steps.append(("cast", DataFrameCaster(dataset_trn.columns))) + preprocess_pipeline.steps.append( + ("cast", DataFrameCaster(dataset_trn.columns)) + ) dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) dataset_tst = preprocess_pipeline.transform(dataset_tst) - # Log metadata of target to both datasets + # Log metadata so we can load it in the inference pipeline log_artifact_metadata( - artifact_name="dataset_trn", - metadata={"target": target}, + artifact_name="preprocess_pipeline", + metadata={"random_state": random_state, "target": target}, ) - log_artifact_metadata( - artifact_name="dataset_tst", - metadata={"target": target}, - ) - - ### YOUR CODE ENDS HERE ### return dataset_trn, dataset_tst, preprocess_pipeline diff --git a/stack-showcase/steps/data_splitter.py b/classifier-e2e/steps/data_splitter.py similarity index 68% rename from stack-showcase/steps/data_splitter.py rename to classifier-e2e/steps/data_splitter.py index dbab8157..bb0e9bd2 100644 --- a/stack-showcase/steps/data_splitter.py +++ b/classifier-e2e/steps/data_splitter.py @@ -1,10 +1,26 @@ -# {% include 'template/license_header' %} +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# from typing import Tuple import pandas as pd from sklearn.model_selection import train_test_split from typing_extensions import Annotated + from zenml import step @@ -34,7 +50,6 @@ def data_splitter( Returns: The split dataset: dataset_trn, dataset_tst. 
""" - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### dataset_trn, dataset_tst = train_test_split( dataset, test_size=test_size, @@ -43,5 +58,4 @@ def data_splitter( ) dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) - ### YOUR CODE ENDS HERE ### return dataset_trn, dataset_tst diff --git a/classifier-e2e/steps/deploy_endpoint.py b/classifier-e2e/steps/deploy_endpoint.py new file mode 100644 index 00000000..7f5ad977 --- /dev/null +++ b/classifier-e2e/steps/deploy_endpoint.py @@ -0,0 +1,61 @@ +from typing_extensions import Annotated + +import sagemaker +from sagemaker.image_uris import retrieve +from sagemaker import Predictor + +from zenml import step, get_step_context, ArtifactConfig, log_artifact_metadata +from datetime import datetime + +from utils.aws import get_aws_config +from utils.sagemaker_materializer import SagemakerPredictorMaterializer + + +@step( + enable_cache=False, + output_materializers=[SagemakerPredictorMaterializer], +) +def deploy_endpoint() -> Annotated[ + Predictor, + ArtifactConfig(name="sagemaker_endpoint", is_deployment_artifact=True), +]: + role, session, region = get_aws_config() + + model = get_step_context().model._get_model_version() + if "sgd" in {t.name for t in model.tags}: + image_uri = retrieve( + region=region, framework="sklearn", version="1.0-1" + ) + entry_point = "utils/sklearn_inference.py" + else: + image_uri = retrieve( + region=region, framework="xgboost", version="1.5-1" + ) + entry_point = None + + model_data = ( + f'{model.get_artifact("breast_cancer_classifier").uri}/model.tar.gz' + ) + + endpoint_name = f'breast-cancer-classifier-{datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")}' + sagemaker.Model( + image_uri=image_uri, + model_data=model_data, + sagemaker_session=session, + role=role, + entry_point=entry_point, + ).deploy( + initial_instance_count=1, + instance_type="ml.m5.large", + endpoint_name=endpoint_name, + ) + + log_artifact_metadata( + { + "endpoint_name": endpoint_name, + "image_uri": image_uri, + "role_arn": role, + } + ) + + return Predictor(endpoint_name=endpoint_name) diff --git a/stack-showcase/steps/inference_predict.py b/classifier-e2e/steps/inference_predict.py similarity index 82% rename from stack-showcase/steps/inference_predict.py rename to classifier-e2e/steps/inference_predict.py index 8d74cc80..cd1d2921 100644 --- a/stack-showcase/steps/inference_predict.py +++ b/classifier-e2e/steps/inference_predict.py @@ -15,10 +15,12 @@ # limitations under the License. # +from typing import Any import pandas as pd from typing_extensions import Annotated -from zenml import get_step_context, step + +from zenml import step from zenml.logger import get_logger logger = get_logger(__name__) @@ -26,12 +28,13 @@ @step def inference_predict( + model: Any, dataset_inf: pd.DataFrame, ) -> Annotated[pd.Series, "predictions"]: """Predictions step. - This is an example of a predictions step that takes the data in and returns - predicted values. + This is an example of a predictions step that takes the data and model in + and returns predicted values. This step is parameterized, which allows you to configure the step independently of the step code, before running it in a pipeline. @@ -41,19 +44,14 @@ def inference_predict( https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines Args: + model: Trained model. dataset_inf: The inference dataset. 
Returns: The predictions as pandas series """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - zenml_model = get_step_context().model - # run prediction from memory - predictor = zenml_model.load_artifact("model") - predictions = predictor.predict(dataset_inf) + predictions = model.predict(dataset_inf) predictions = pd.Series(predictions, name="predicted") - ### YOUR CODE ENDS HERE ### - return predictions diff --git a/stack-showcase/steps/inference_preprocessor.py b/classifier-e2e/steps/inference_preprocessor.py similarity index 91% rename from stack-showcase/steps/inference_preprocessor.py rename to classifier-e2e/steps/inference_preprocessor.py index c7a5ae1c..d484433e 100644 --- a/stack-showcase/steps/inference_preprocessor.py +++ b/classifier-e2e/steps/inference_preprocessor.py @@ -18,6 +18,7 @@ import pandas as pd from sklearn.pipeline import Pipeline from typing_extensions import Annotated + from zenml import step @@ -42,11 +43,8 @@ def inference_preprocessor( Returns: The processed dataframe: dataset_inf. """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### # artificially adding `target` column to avoid Pipeline issues dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) dataset_inf = preprocess_pipeline.transform(dataset_inf) - dataset_inf.drop(columns=["target"], inplace=True) - ### YOUR CODE ENDS HERE ### - + dataset_inf.drop(columns=[target], inplace=True) return dataset_inf diff --git a/classifier-e2e/steps/misc_endpoint.py b/classifier-e2e/steps/misc_endpoint.py new file mode 100644 index 00000000..aa6c7c8d --- /dev/null +++ b/classifier-e2e/steps/misc_endpoint.py @@ -0,0 +1,25 @@ +from typing_extensions import Annotated + + +from zenml import step +from sagemaker.predictor import Predictor +import pandas as pd + + +@step +def predict_on_endpoint( + predictor: Predictor, dataset: pd.DataFrame +) -> Annotated[pd.Series, "real_time_predictions"]: + predictions = predictor.predict( + data=dataset.to_csv(header=False, index=False), + initial_args={"ContentType": "text/csv"}, + ) + return pd.Series( + [float(l) for l in predictions.decode().split("\n") if l], + name="predictions", + ) + + +@step +def shutdown_endpoint(predictor: Predictor): + predictor.delete_endpoint() diff --git a/stack-showcase/steps/model_evaluator.py b/classifier-e2e/steps/model_evaluator.py similarity index 59% rename from stack-showcase/steps/model_evaluator.py rename to classifier-e2e/steps/model_evaluator.py index f9e72fc3..db15b689 100644 --- a/stack-showcase/steps/model_evaluator.py +++ b/classifier-e2e/steps/model_evaluator.py @@ -1,24 +1,51 @@ -# {% include 'template/license_header' %} +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional import pandas as pd -import mlflow from sklearn.base import ClassifierMixin -from zenml import step, log_artifact_metadata -from zenml.client import Client +from sklearn.metrics import confusion_matrix + +from zenml import ( + log_artifact_metadata, + step, + log_model_metadata, + get_step_context, +) from zenml.logger import get_logger -from zenml import get_step_context +import wandb +from zenml.client import Client +from zenml.exceptions import StepContextError + logger = get_logger(__name__) -experiment_tracker = Client().active_stack.experiment_tracker +et = Client().active_stack.experiment_tracker -@step(enable_cache=False, experiment_tracker="mlflow") + +@step(experiment_tracker=et.name) def model_evaluator( model: ClassifierMixin, dataset_trn: pd.DataFrame, dataset_tst: pd.DataFrame, min_train_accuracy: float = 0.0, min_test_accuracy: float = 0.0, + target: Optional[str] = "target", ) -> float: """Evaluate a trained model. @@ -50,29 +77,21 @@ def model_evaluator( dataset_tst: The test dataset. min_train_accuracy: Minimal acceptable training accuracy value. min_test_accuracy: Minimal acceptable testing accuracy value. - fail_on_accuracy_quality_gates: If `True` a `RuntimeException` is raised - upon not meeting one of the minimal accuracy thresholds. + target: Name of target column in dataset. Returns: The model accuracy on the test set. - - Raises: - RuntimeError: if any of accuracies is lower than respective threshold """ - # context = get_step_context() - # target = context.inputs["dataset_trn"].run_metadata['target'].value - target = "target" - # Calculate the model accuracy on the train and test set trn_acc = model.score( dataset_trn.drop(columns=[target]), dataset_trn[target], ) - logger.info(f"Train accuracy={trn_acc*100:.2f}%") tst_acc = model.score( dataset_tst.drop(columns=[target]), dataset_tst[target], ) + logger.info(f"Train accuracy={trn_acc*100:.2f}%") logger.info(f"Test accuracy={tst_acc*100:.2f}%") messages = [] @@ -88,15 +107,34 @@ for message in messages: logger.warning(message) - artifact = get_step_context().model.get_artifact("model") + predictions = model.predict(dataset_tst.drop(columns=[target])) + metadata = { + "train_accuracy": float(trn_acc), + "test_accuracy": float(tst_acc), + "confusion_matrix": confusion_matrix(dataset_tst[target], predictions) + .ravel() + .tolist(), + } + try: + if get_step_context().model: + log_model_metadata(metadata={"wandb_url": wandb.run.url}) + except StepContextError: + # if no Model is configured, we are not able to log model metadata + pass log_artifact_metadata( - metadata={"train_accuracy": float(trn_acc), "test_accuracy": float(tst_acc)}, - artifact_name=artifact.name, - artifact_version=artifact.version, + metadata=metadata, + artifact_name="breast_cancer_classifier", ) - mlflow.log_metric("train_accuracy", float(trn_acc)) - mlflow.log_metric("test_accuracy", float(tst_acc)) + wandb.log( + { + "confusion_matrix": wandb.sklearn.plot_confusion_matrix( + dataset_tst[target], predictions, ["No Cancer", "Cancer"] + ) + } + ) + wandb.log({"train_accuracy": metadata["train_accuracy"]}) + wandb.log({"test_accuracy": metadata["test_accuracy"]}) - return float(trn_acc) + return float(tst_acc) diff --git a/classifier-e2e/steps/model_promoter.py b/classifier-e2e/steps/model_promoter.py new file mode 100644 index 00000000..aac7083f --- /dev/null +++ b/classifier-e2e/steps/model_promoter.py @@ -0,0 +1,85 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from zenml import get_step_context, step +from zenml.client import Client +from zenml.logger import get_logger +from sklearn.metrics import accuracy_score + +logger = get_logger(__name__) + + +@step +def model_promoter(accuracy: float, stage: str = "production") -> bool: + """Model promoter step. + + This is an example of a step that conditionally promotes a model. It takes + in the accuracy of the model and the stage to promote the model to. If the + accuracy is below 80%, the model is not promoted. If it is above 80%, the + model is promoted to the stage indicated in the parameters. If there is + already a model in the indicated stage, the model with the higher accuracy + is promoted. + + Args: + accuracy: Accuracy of the model. + stage: Which stage to promote the model to. + + Returns: + Whether the model was promoted or not. + """ + is_promoted = False + + if accuracy < 0.8: + logger.info( + f"Model accuracy {accuracy*100:.2f}% is below 80%! Not promoting model." + ) + else: + is_promoted = True + + # Get the model in the current context + current_model = get_step_context().model + + # Get the model that is in the production stage + client = Client() + try: + stage_model = client.get_model_version(current_model.name, stage) + # We compare their metrics + prod_classifier = stage_model.get_artifact("breast_cancer_classifier") + if prod_classifier: + # and recompute metrics for the current prod model using the current test set + prod_classifier = prod_classifier.load() + current_dataset = current_model.get_artifact( + "dataset_tst" + ).load() + prod_accuracy = accuracy_score( + current_dataset["target"], + prod_classifier.predict( + current_dataset.drop(columns="target"), + ), + ) + else: + prod_accuracy = 0 + if float(accuracy) > float(prod_accuracy): + # If the current model has better metrics, we promote it + logger.info(f"Model promoted to {stage}!") + is_promoted = True + current_model.set_stage(stage, force=True) + except KeyError: + # If no such model exists, the current one is promoted + is_promoted = True + current_model.set_stage(stage, force=True) + return is_promoted diff --git a/classifier-e2e/steps/model_trainer.py b/classifier-e2e/steps/model_trainer.py new file mode 100644 index 00000000..3b84362a --- /dev/null +++ b/classifier-e2e/steps/model_trainer.py @@ -0,0 +1,75 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.linear_model import SGDClassifier +from typing_extensions import Annotated + +from zenml import ArtifactConfig, step +from zenml.logger import get_logger + +from utils.sagemaker_materializer import SagemakerMaterializer + +logger = get_logger(__name__) + +@step(output_materializers=[SagemakerMaterializer,]) +def model_trainer( + dataset_trn: pd.DataFrame, + model_type: str = "sgd", + target: Optional[str] = "target", +) -> Annotated[ + ClassifierMixin, + ArtifactConfig(name="breast_cancer_classifier", is_model_artifact=True), +]: + """Configure and train a model on the training dataset. + + This is an example of a model training step that takes in a dataset artifact + previously loaded and pre-processed by other steps in your pipeline, then + configures and trains a model on it. The model is then returned as a step + output artifact. + + Args: + dataset_trn: The preprocessed train dataset. + model_type: The type of model to train. + target: The name of the target column in the dataset. + + Returns: + The trained model artifact. + + Raises: + ValueError: If the model type is not supported. + """ + # Initialize the model with the hyperparameters indicated in the step + # parameters and train it on the training set. + if model_type == "sgd": + model = SGDClassifier() + elif model_type == "xgboost": + from xgboost import XGBClassifier + + model = XGBClassifier() + else: + raise ValueError(f"Unknown model type {model_type}") + logger.info(f"Training model {model}...") + + model.fit( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + return model diff --git a/classifier-e2e/utils/__init__.py b/classifier-e2e/utils/__init__.py new file mode 100644 index 00000000..8d4e9614 --- /dev/null +++ b/classifier-e2e/utils/__init__.py @@ -0,0 +1,16 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/classifier-e2e/utils/aws.py b/classifier-e2e/utils/aws.py new file mode 100644 index 00000000..c74bb410 --- /dev/null +++ b/classifier-e2e/utils/aws.py @@ -0,0 +1,23 @@ +import os +from typing import Tuple, Any + +import boto3 +import sagemaker + + +def get_aws_config() -> Tuple[Any, sagemaker.Session, str]: + REGION_NAME = os.getenv("AWS_REGION", "us-east-1") + ROLE_NAME = os.getenv("AWS_ROLE_NAME", "hamza_connector") + os.environ["AWS_DEFAULT_REGION"] = REGION_NAME + + auth_arguments = { + "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID", None), + "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY", None), + "aws_session_token": os.getenv("AWS_SESSION_TOKEN", None), + "region_name": REGION_NAME, + } + iam = boto3.client("iam", **auth_arguments) + role = iam.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] + session = sagemaker.Session(boto3.Session(**auth_arguments)) + + return role, session, REGION_NAME diff --git a/classifier-e2e/utils/preprocess.py b/classifier-e2e/utils/preprocess.py new file mode 100644 index 00000000..df60bce3 --- /dev/null +++ b/classifier-e2e/utils/preprocess.py @@ -0,0 +1,56 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Union + +import pandas as pd + + +class NADropper: + """Support class to drop NA values in sklearn Pipeline.""" + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.dropna() + + +class ColumnsDropper: + """Support class to drop specific columns in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.drop(columns=self.columns) + + +class DataFrameCaster: + """Support class to cast type back to pd.DataFrame in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X): + return pd.DataFrame(X, columns=self.columns) diff --git a/classifier-e2e/utils/sagemaker_materializer.py b/classifier-e2e/utils/sagemaker_materializer.py new file mode 100644 index 00000000..aa886af9 --- /dev/null +++ b/classifier-e2e/utils/sagemaker_materializer.py @@ -0,0 +1,88 @@ +import os +from typing import Type, Union + +from zenml.enums import ArtifactType +from zenml.io import fileio +from zenml.materializers.base_materializer import BaseMaterializer +from zenml.materializers.built_in_materializer import BuiltInMaterializer +from sklearn.linear_model import SGDClassifier +from xgboost import XGBClassifier +import tarfile +import tempfile +import joblib +from sklearn.base import ClassifierMixin +from sagemaker import Predictor + +class SagemakerMaterializer(BaseMaterializer): + ASSOCIATED_TYPES = (ClassifierMixin,) + ASSOCIATED_ARTIFACT_TYPE = ArtifactType.DATA + + def load( + self, data_type: Type[ClassifierMixin] + ) -> Union[SGDClassifier, XGBClassifier]: + """Read from artifact store.""" + fileio.copy( + os.path.join(self.uri, "model.tar.gz"), + os.path.join(tempfile.gettempdir(), "model.tar.gz"), + overwrite=True, + ) + est = None + with tarfile.open( + os.path.join(tempfile.gettempdir(), "model.tar.gz"), "r:gz" + ) as tar: + for member in tar.getmembers(): + tar.extract(member.name, tempfile.gettempdir()) + if member.name == "sklearn-model": + est = joblib.load( + os.path.join(tempfile.gettempdir(), "sklearn-model"), + ) + if member.name == "xgboost-model": + est = XGBClassifier() + est.load_model( + os.path.join(tempfile.gettempdir(), "xgboost-model") + ) + fileio.remove(os.path.join(tempfile.gettempdir(), member.name)) + if est: + break + if est is None: + raise RuntimeError( + "Failed to load estimator via SagemakerMaterializer..." 
+ ) + return est + + def save(self, my_obj: ClassifierMixin) -> None: + """Write to artifact store.""" + with tarfile.open( + os.path.join(tempfile.gettempdir(), "model.tar.gz"), "w:gz" + ) as tar: + is_xgboost = isinstance(my_obj, XGBClassifier) + file_name = ("xgboost" if is_xgboost else "sklearn") + "-model" + tmp_ = os.path.join(tempfile.gettempdir(), file_name) + if is_xgboost: + # if model supports saving - use it over joblib + my_obj.save_model(tmp_) + else: + joblib.dump(my_obj, tmp_) + tar.add(tmp_, arcname=file_name) + fileio.remove(tmp_) + fileio.copy( + os.path.join(tempfile.gettempdir(), "model.tar.gz"), + os.path.join(self.uri, "model.tar.gz"), + overwrite=True, + ) + fileio.remove(os.path.join(tempfile.gettempdir(), "model.tar.gz")) + + +class SagemakerPredictorMaterializer(BaseMaterializer): + ASSOCIATED_TYPES = (Predictor,) + ASSOCIATED_ARTIFACT_TYPE = ArtifactType.SERVICE + + def load( + self, data_type: Type[Predictor] + ) -> Predictor: + """Read from artifact store.""" + return Predictor(endpoint_name=BuiltInMaterializer(self.uri).load(str)) + + def save(self, my_obj: Predictor) -> None: + """Write to artifact store.""" + BuiltInMaterializer(self.uri).save(my_obj.endpoint_name) diff --git a/classifier-e2e/utils/sklearn_inference.py b/classifier-e2e/utils/sklearn_inference.py new file mode 100644 index 00000000..bf7a8d2c --- /dev/null +++ b/classifier-e2e/utils/sklearn_inference.py @@ -0,0 +1,41 @@ +import joblib +import os +import pandas as pd +from io import StringIO + +""" +Deserialize fitted model +""" +def model_fn(model_dir): + model = joblib.load(os.path.join(model_dir, "sklearn-model")) + return model + +""" +input_fn + request_body: The body of the request sent to the model. + request_content_type: (string) specifies the format/variable type of the request +""" +def input_fn(request_body, request_content_type): + if request_content_type == 'text/csv': + request = pd.read_csv(StringIO(request_body)) + return request + else: + raise ValueError("This model only supports text/csv input") + +""" +predict_fn + input_data: returned array from input_fn above + model (sklearn model) returned model loaded from model_fn above +""" +def predict_fn(input_data, model): + return model.predict(input_data) + +""" +output_fn + prediction: the returned value from predict_fn above + content_type: the content type the endpoint expects to be returned. Ex: JSON, string + +""" + +def output_fn(prediction, content_type): + return pd.Series(prediction).to_csv(index=False,header=False) \ No newline at end of file diff --git a/stack-showcase/Dockerfile b/stack-showcase/Dockerfile deleted file mode 100644 index 0f09cfbc..00000000 --- a/stack-showcase/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker -# you will also find guides on how best to write your Dockerfile - -FROM python:3.9 - -WORKDIR /code - -COPY ./requirements.txt /code/requirements.txt - -RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt - -# Set up a new user named "user" with user ID 1000 -RUN useradd -m -u 1000 user -# Switch to the "user" user -USER user -# Set home to the user's home directory -ENV HOME=/home/user \ - PATH=/home/user/.local/bin:$PATH - -# Set the working directory to the user's home directory -WORKDIR $HOME/app - -# Copy the current directory contents into the container at $HOME/app setting the owner to the user -COPY --chown=user . 
$HOME/app - -CMD ["python", "app.py", "--server.port=7860", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/stack-showcase/_assets/airflow_stack.png b/stack-showcase/_assets/airflow_stack.png deleted file mode 100644 index e4b29a78..00000000 Binary files a/stack-showcase/_assets/airflow_stack.png and /dev/null differ diff --git a/stack-showcase/_assets/default_stack.png b/stack-showcase/_assets/default_stack.png deleted file mode 100644 index 07ba8907..00000000 Binary files a/stack-showcase/_assets/default_stack.png and /dev/null differ diff --git a/stack-showcase/_assets/deployment_pipeline.png b/stack-showcase/_assets/deployment_pipeline.png deleted file mode 100644 index e9cf495d..00000000 Binary files a/stack-showcase/_assets/deployment_pipeline.png and /dev/null differ diff --git a/stack-showcase/_assets/feature_engineering_pipeline.png b/stack-showcase/_assets/feature_engineering_pipeline.png deleted file mode 100644 index db301913..00000000 Binary files a/stack-showcase/_assets/feature_engineering_pipeline.png and /dev/null differ diff --git a/stack-showcase/_assets/inference_pipeline.png b/stack-showcase/_assets/inference_pipeline.png deleted file mode 100644 index 358d5537..00000000 Binary files a/stack-showcase/_assets/inference_pipeline.png and /dev/null differ diff --git a/stack-showcase/_assets/inference_pipeline.png:Zone.Identifier b/stack-showcase/_assets/inference_pipeline.png:Zone.Identifier deleted file mode 100644 index e69de29b..00000000 diff --git a/stack-showcase/_assets/local_sagmaker_so_stack.png b/stack-showcase/_assets/local_sagmaker_so_stack.png deleted file mode 100644 index f9762766..00000000 Binary files a/stack-showcase/_assets/local_sagmaker_so_stack.png and /dev/null differ diff --git a/stack-showcase/_assets/pipeline_overview.png b/stack-showcase/_assets/pipeline_overview.png deleted file mode 100644 index 609e97d2..00000000 Binary files a/stack-showcase/_assets/pipeline_overview.png and /dev/null differ diff --git a/stack-showcase/_assets/sagemaker_stack.png b/stack-showcase/_assets/sagemaker_stack.png deleted file mode 100644 index 985f510c..00000000 Binary files a/stack-showcase/_assets/sagemaker_stack.png and /dev/null differ diff --git a/stack-showcase/_assets/training_pipeline.png b/stack-showcase/_assets/training_pipeline.png deleted file mode 100644 index a2e6a7d0..00000000 Binary files a/stack-showcase/_assets/training_pipeline.png and /dev/null differ diff --git a/stack-showcase/app.py b/stack-showcase/app.py deleted file mode 100644 index cdd2d6aa..00000000 --- a/stack-showcase/app.py +++ /dev/null @@ -1,56 +0,0 @@ -import gradio as gr -import numpy as np -import pandas as pd -from sklearn.datasets import load_breast_cancer -from zenml.client import Client -import os - -ZENML_STORE_API_KEY = os.getenv("ZENML_STORE_API_KEY", None) -ZENML_STORE_URL = os.getenv("ZENML_STORE_URL", None) - -if ZENML_STORE_API_KEY: - # Use os.process to call zenml connect --url ZENML_STORE_URL --api-key ZENML_STORE_API_KEY - os.system(f"zenml connect --url {ZENML_STORE_URL} --api-key {ZENML_STORE_API_KEY}") - -client = Client() -zenml_model = client.get_model_version("breast_cancer_classifier", "production") -preprocess_pipeline = zenml_model.get_artifact("preprocess_pipeline").load() - -# Load the model -clf = zenml_model.get_artifact("model").load() - -# Load dataset to get feature names -data = load_breast_cancer() -feature_names = data.feature_names - -def classify(*input_features): - # Convert the input features to pandas DataFrame - 
input_features = np.array(input_features).reshape(1, -1) - input_df = pd.DataFrame(input_features, columns=feature_names) - - # Pre-process the DataFrame - input_df["target"] = pd.Series([1] * input_df.shape[0]) - input_df = preprocess_pipeline.transform(input_df) - input_df.drop(columns=["target"], inplace=True) - - # Make a prediction - prediction_proba = clf.predict_proba(input_df)[0] - - # Map predicted class probabilities - classes = data.target_names - return {classes[idx]: prob for idx, prob in enumerate(prediction_proba)} - -# Define a list of Number inputs for each feature -input_components = [gr.Number(label=feature_name, default=0) for feature_name in feature_names] - -# Define the Gradio interface -iface = gr.Interface( - fn=classify, - inputs=input_components, - outputs=gr.Label(num_top_classes=2), - title="Breast Cancer Classifier", - description="Enter the required measurements to predict the classification for breast cancer." -) - -# Launch the Gradio app -iface.launch() \ No newline at end of file diff --git a/stack-showcase/configs/deployment.yaml b/stack-showcase/configs/deployment.yaml deleted file mode 100644 index 3fa9165a..00000000 --- a/stack-showcase/configs/deployment.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# environment configuration -settings: - docker: - required_integrations: - - sklearn - -# configuration of the Model Control Plane -model: - name: breast_cancer_classifier - version: production - license: Apache 2.0 - description: Classification of Breast Cancer Dataset. - tags: ["classification", "sklearn"] diff --git a/stack-showcase/configs/feature_engineering.yaml b/stack-showcase/configs/feature_engineering.yaml deleted file mode 100644 index c584be48..00000000 --- a/stack-showcase/configs/feature_engineering.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# environment configuration -settings: - docker: - required_integrations: - - sklearn - -# configuration of the Model Control Plane -model: - name: breast_cancer_classifier - license: Apache 2.0 - description: Classification of Breast Cancer Dataset. - tags: ["classification", "sklearn"] diff --git a/stack-showcase/configs/inference.yaml b/stack-showcase/configs/inference.yaml deleted file mode 100644 index 3fa9165a..00000000 --- a/stack-showcase/configs/inference.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# environment configuration -settings: - docker: - required_integrations: - - sklearn - -# configuration of the Model Control Plane -model: - name: breast_cancer_classifier - version: production - license: Apache 2.0 - description: Classification of Breast Cancer Dataset. 
- tags: ["classification", "sklearn"] diff --git a/stack-showcase/flagged/log.csv b/stack-showcase/flagged/log.csv deleted file mode 100644 index 95b5cb76..00000000 --- a/stack-showcase/flagged/log.csv +++ /dev/null @@ -1,2 +0,0 @@ -mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,output,flag,username,timestamp -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,/home/htahir1/workspace/zenml_io/zenml-projects/stack-showcase/flagged/output/tmpjy2eamkw.json,,,2024-01-04 14:08:33.097778 diff --git a/stack-showcase/flagged/output/tmpjy2eamkw.json b/stack-showcase/flagged/output/tmpjy2eamkw.json deleted file mode 100644 index 9e26dfee..00000000 --- a/stack-showcase/flagged/output/tmpjy2eamkw.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/stack-showcase/pipelines/__init__.py b/stack-showcase/pipelines/__init__.py deleted file mode 100644 index 12c05849..00000000 --- a/stack-showcase/pipelines/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# {% include 'template/license_header' %} - -from .feature_engineering import feature_engineering -from .inference import inference -from .training import breast_cancer_training -from .deployment import breast_cancer_deployment_pipeline \ No newline at end of file diff --git a/stack-showcase/pipelines/deployment.py b/stack-showcase/pipelines/deployment.py deleted file mode 100644 index 6e16cda8..00000000 --- a/stack-showcase/pipelines/deployment.py +++ /dev/null @@ -1,38 +0,0 @@ -# {% include 'template/license_header' %} - -from typing import Optional, List - -from steps import ( - deploy_to_huggingface, -) -from zenml import get_pipeline_context, pipeline -from zenml.logger import get_logger -from zenml.client import Client - -logger = get_logger(__name__) - - -@pipeline -def breast_cancer_deployment_pipeline( - repo_name: Optional[str] = "zenml_breast_cancer_classifier", -): - """ - Model deployment pipeline. - - This pipelines deploys latest model on mlflow registry that matches - the given stage, to one of the supported deployment targets. - - Args: - labels: List of labels for the model. - title: Title for the model. - description: Description for the model. - model_name_or_path: Name or path of the model. - tokenizer_name_or_path: Name or path of the tokenizer. - interpretation: Interpretation for the model. - example: Example for the model. - repo_name: Name of the repository to deploy to HuggingFace Hub. 
- """ - ########## Deploy to HuggingFace ########## - deploy_to_huggingface( - repo_name=repo_name, - ) diff --git a/stack-showcase/pipelines/inference.py b/stack-showcase/pipelines/inference.py deleted file mode 100644 index d35634ed..00000000 --- a/stack-showcase/pipelines/inference.py +++ /dev/null @@ -1,50 +0,0 @@ -# {% include 'template/license_header' %} - -from typing import List, Optional - -from steps import ( - data_loader, - inference_preprocessor, - inference_predict, -) -from zenml import pipeline, ExternalArtifact -from zenml.logger import get_logger - -logger = get_logger(__name__) - - -@pipeline -def inference( - test_size: float = 0.2, - drop_na: Optional[bool] = None, - normalize: Optional[bool] = None, - drop_columns: Optional[List[str]] = None, -): - """ - Model training pipeline. - - This is a pipeline that loads the data, processes it and splits - it into train and test sets, then search for best hyperparameters, - trains and evaluates a model. - - Args: - test_size: Size of holdout set for training 0.0..1.0 - drop_na: If `True` NA values will be removed from dataset - normalize: If `True` dataset will be normalized with MinMaxScaler - drop_columns: List of columns to drop from dataset - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Link all the steps together by calling them and passing the output - # of one step as the input of the next step. - random_state = 60 - target = "target" - df_inference = data_loader(random_state=random_state, is_inference=True) - df_inference = inference_preprocessor( - dataset_inf=df_inference, - preprocess_pipeline=ExternalArtifact(name="preprocess_pipeline"), - target=target, - ) - inference_predict( - dataset_inf=df_inference, - ) - ### END CODE HERE ### diff --git a/stack-showcase/pipelines/training.py b/stack-showcase/pipelines/training.py deleted file mode 100644 index 13108cb8..00000000 --- a/stack-showcase/pipelines/training.py +++ /dev/null @@ -1,61 +0,0 @@ -# {% include 'template/license_header' %} - -from typing import Optional -from uuid import UUID - -from steps import model_evaluator, model_trainer, model_promoter -from zenml import ExternalArtifact, pipeline -from zenml.logger import get_logger - -from pipelines import ( - feature_engineering, -) - -logger = get_logger(__name__) - - -@pipeline(enable_cache=True) -def breast_cancer_training( - train_dataset_id: Optional[UUID] = None, - test_dataset_id: Optional[UUID] = None, - min_train_accuracy: float = 0.0, - min_test_accuracy: float = 0.0, -): - """ - Model training pipeline. - - This is a pipeline that loads the data, processes it and splits - it into train and test sets, then search for best hyperparameters, - trains and evaluates a model. - - Args: - test_size: Size of holdout set for training 0.0..1.0 - drop_na: If `True` NA values will be removed from dataset - normalize: If `True` dataset will be normalized with MinMaxScaler - drop_columns: List of columns to drop from dataset - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - # Link all the steps together by calling them and passing the output - # of one step as the input of the next step. 
- - # Execute Feature Engineering Pipeline - if train_dataset_id is None or test_dataset_id is None: - dataset_trn, dataset_tst = feature_engineering() - else: - dataset_trn = ExternalArtifact(id=train_dataset_id) - dataset_tst = ExternalArtifact(id=test_dataset_id) - - model = model_trainer( - dataset_trn=dataset_trn, - ) - - acc = model_evaluator( - model=model, - dataset_trn=dataset_trn, - dataset_tst=dataset_tst, - min_train_accuracy=min_train_accuracy, - min_test_accuracy=min_test_accuracy, - ) - - model_promoter(accuracy=acc) - ### END CODE HERE ### diff --git a/stack-showcase/run.ipynb b/stack-showcase/run.ipynb deleted file mode 100644 index 85a6c449..00000000 --- a/stack-showcase/run.ipynb +++ /dev/null @@ -1,979 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "081d5616", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", - "\u001b[?25l\u001b[2;36mFound existing ZenML repository at path \u001b[0m\n", - "\u001b[2;32m'/home/apenner/PycharmProjects/template-starter/template'\u001b[0m\u001b[2;36m.\u001b[0m\n", - "\u001b[2;32mโ ‹\u001b[0m\u001b[2;36m Initializing ZenML repository at \u001b[0m\n", - "\u001b[2;36m/home/apenner/PycharmProjects/template-starter/template.\u001b[0m\n", - "\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[32mโ ‹\u001b[0m Initializing ZenML repository at \n", - "/home/apenner/PycharmProjects/template-starter/template.\n", - "\n", - "\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1A\u001b[2K\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n", - "\u001b[2K\u001b[2;36mActive repository stack set to: \u001b[0m\u001b[2;32m'default'\u001b[0m.\n", - "\u001b[2K\u001b[32mโ ™\u001b[0m Setting the repository active stack to 'default'...t'...\u001b[0m\n", - "\u001b[1A\u001b[2K" - ] - } - ], - "source": [ - "!zenml init\n", - "!zenml stack set default" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "79f775f2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;35mNumExpr defaulting to 8 threads.\u001b[0m\n" - ] - } - ], - "source": [ - "# Do the imports at the top\n", - "\n", - "import random\n", - "from zenml import ExternalArtifact, pipeline, Model\n", - "from zenml.client import Client\n", - "from zenml.logger import get_logger\n", - "from uuid import UUID\n", - "\n", - "import os\n", - "from typing import Optional, List\n", - "\n", - "from zenml import pipeline\n", - "\n", - "from steps import (\n", - " data_loader,\n", - " data_preprocessor,\n", - " data_splitter,\n", - " model_evaluator,\n", - " model_trainer,\n", - " inference_predict,\n", - " inference_preprocessor\n", - ")\n", - "\n", - "logger = get_logger(__name__)\n", - "\n", - "client = Client()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b50a9537", - "metadata": {}, - "outputs": [], - "source": [ - "@pipeline\n", - "def feature_engineering(\n", - " test_size: float = 0.2,\n", - " drop_na: Optional[bool] = None,\n", - " normalize: Optional[bool] = None,\n", - " drop_columns: Optional[List[str]] = None,\n", - " target: Optional[str] = \"target\",\n", - "):\n", - " \"\"\"\n", - " Feature engineering pipeline.\n", - "\n", - " This is a pipeline that loads the data, processes it and splits\n", - " it into train and test sets.\n", - "\n", - " Args:\n", - " test_size: Size of holdout set for training 0.0..1.0\n", - " drop_na: If `True` NA values will be removed from 
dataset\n", - " normalize: If `True` dataset will be normalized with MinMaxScaler\n", - " drop_columns: List of columns to drop from dataset\n", - " target: Name of target column in dataset\n", - " \"\"\"\n", - " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", - " # Link all the steps together by calling them and passing the output\n", - " # of one step as the input of the next step.\n", - " raw_data = data_loader(random_state=random.randint(0, 100), target=target)\n", - " dataset_trn, dataset_tst = data_splitter(\n", - " dataset=raw_data,\n", - " test_size=test_size,\n", - " )\n", - " dataset_trn, dataset_tst, _ = data_preprocessor(\n", - " dataset_trn=dataset_trn,\n", - " dataset_tst=dataset_tst,\n", - " drop_na=drop_na,\n", - " normalize=normalize,\n", - " drop_columns=drop_columns,\n", - " target=target,\n", - " )\n", - " \n", - " return dataset_trn, dataset_tst" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "bc5feef4-7016-420e-9af9-2e87ff666f74", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_args = {}\n", - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"feature_engineering.yaml\")\n", - "fe_p_configured = feature_engineering.with_options(**pipeline_args)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "75cf3740-b2d8-4c4b-b91b-dc1637000880", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mfeature_engineering\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mReusing registered version: \u001b[0m\u001b[1;36m(version: 1)\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mNew model version \u001b[0m\u001b[1;36m34\u001b[1;35m was created.\u001b[0m\n", - "\u001b[1;35mExecuting a new run.\u001b[0m\n", - "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mDataset with 541 records loaded!\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has finished in \u001b[0m\u001b[1;36m6.777s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has finished in \u001b[0m\u001b[1;36m11.345s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m14.866s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mRun \u001b[0m\u001b[1;36mfeature_engineering-2023_12_06-09_08_46_821042\u001b[1;35m has finished in \u001b[0m\u001b[1;36m36.198s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/52874ade-f314-45ab-b9bf-e95fb29290b8/runs/9d9e49b1-d78f-478b-991e-da87b0560512/dag\u001b[0m\n" - ] - } - ], - "source": [ - "latest_run = fe_p_configured()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "69ade540", - "metadata": {}, - "outputs": [], - "source": [ - "@pipeline\n", - "def training(\n", - " train_dataset_id: Optional[UUID] = 
None,\n", - " test_dataset_id: Optional[UUID] = None,\n", - " min_train_accuracy: float = 0.0,\n", - " min_test_accuracy: float = 0.0,\n", - "):\n", - " \"\"\"\n", - " Model training pipeline.\n", - "\n", - " This is a pipeline that loads the data, processes it and splits\n", - " it into train and test sets, then search for best hyperparameters,\n", - " trains and evaluates a model.\n", - "\n", - " Args:\n", - " test_size: Size of holdout set for training 0.0..1.0\n", - " drop_na: If `True` NA values will be removed from dataset\n", - " normalize: If `True` dataset will be normalized with MinMaxScaler\n", - " drop_columns: List of columns to drop from dataset\n", - " \"\"\"\n", - " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", - " # Link all the steps together by calling them and passing the output\n", - " # of one step as the input of the next step.\n", - " \n", - " # Execute Feature Engineering Pipeline\n", - " if train_dataset_id is None or test_dataset_id is None:\n", - " dataset_trn, dataset_tst = feature_engineering()\n", - " else:\n", - " dataset_trn = ExternalArtifact(id=train_dataset_id)\n", - " dataset_tst = ExternalArtifact(id=test_dataset_id)\n", - " \n", - " model = model_trainer(\n", - " dataset_trn=dataset_trn,\n", - " )\n", - "\n", - " model_evaluator(\n", - " model=model,\n", - " dataset_trn=dataset_trn,\n", - " dataset_tst=dataset_tst,\n", - " min_train_accuracy=min_train_accuracy,\n", - " min_test_accuracy=min_test_accuracy,\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5b1f78df", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_args = {}\n", - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"training.yaml\")\n", - "fe_t_configured = training.with_options(**pipeline_args)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "acf306a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mtraining\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mRegistered new version: \u001b[0m\u001b[1;36m(version 2)\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mNew model version \u001b[0m\u001b[1;36m35\u001b[1;35m was created.\u001b[0m\n", - "\u001b[1;35mExecuting a new run.\u001b[0m\n", - "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mDataset with 541 records loaded!\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has finished in \u001b[0m\u001b[1;36m7.368s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_splitter\u001b[1;35m has finished in \u001b[0m\u001b[1;36m11.009s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m14.134s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mCaching \u001b[0m\u001b[1;36mdisabled\u001b[1;35m explicitly for 
\u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mTraining model DecisionTreeClassifier()...\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_trainer\u001b[1;35m has finished in \u001b[0m\u001b[1;36m7.035s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_evaluator\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mTrain accuracy=100.00%\u001b[0m\n", - "\u001b[1;35mTest accuracy=92.66%\u001b[0m\n", - "\u001b[1;35mImplicitly linking artifact \u001b[0m\u001b[1;36moutput\u001b[1;35m to model \u001b[0m\u001b[1;36mbreast_cancer_classifier\u001b[1;35m version \u001b[0m\u001b[1;36m35\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mmodel_evaluator\u001b[1;35m has finished in \u001b[0m\u001b[1;36m6.050s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mRun \u001b[0m\u001b[1;36mtraining-2023_12_06-09_09_41_413455\u001b[1;35m has finished in \u001b[0m\u001b[1;36m51.278s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/787c6360-4499-4e2e-8d50-edaaa3956a6f/runs/2a335b9c-bb8e-425c-80e2-0a6cc0ffe56a/dag\u001b[0m\n" - ] - } - ], - "source": [ - "fe_t_configured()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "ad6aa280", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Optional\n", - "\n", - "import pandas as pd\n", - "from typing_extensions import Annotated\n", - "\n", - "from zenml import get_step_context, step\n", - "from zenml.logger import get_logger\n", - "\n", - "logger = get_logger(__name__)\n", - "\n", - "\n", - "@step\n", - "def inference_predict(\n", - " dataset_inf: pd.DataFrame,\n", - ") -> Annotated[pd.Series, \"predictions\"]:\n", - " \"\"\"Predictions step.\n", - "\n", - " This is an example of a predictions step that takes the data in and returns\n", - " predicted values.\n", - "\n", - " This step is parameterized, which allows you to configure the step\n", - " independently of the step code, before running it in a pipeline.\n", - " In this example, the step can be configured to use different input data.\n", - " See the documentation for more information:\n", - "\n", - " https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines\n", - "\n", - " Args:\n", - " dataset_inf: The inference dataset.\n", - "\n", - " Returns:\n", - " The predictions as pandas series\n", - " \"\"\"\n", - " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", - " zenml_model = get_step_context().model\n", - "\n", - " print(zenml_model)\n", - "\n", - " # run prediction from memory\n", - " predictor = zenml_model.load_artifact(\"model\")\n", - " predictions = predictor.predict(dataset_inf)\n", - "\n", - " print(predictions)\n", - " predictions = pd.Series(predictions, name=\"predicted\")\n", - " ### YOUR CODE ENDS HERE ###\n", - "\n", - " return predictions\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "517ad39d", - "metadata": {}, - "outputs": [], - "source": [ - "@pipeline\n", - "def batch_inference():\n", - " \"\"\"\n", - " Model batch inference pipeline.\n", - "\n", - " This is a pipeline that loads the inference data, processes\n", - " it, analyze for data drift and run inference.\n", - " \"\"\"\n", - " ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###\n", - " # Link all the steps together by calling them and passing the output\n", - " # of one step as the input of the next 
step.\n", - " ########## ETL stage ##########\n", - " random_state = client.get_artifact(\"dataset\").run_metadata[\"random_state\"].value\n", - " target = client.get_artifact(\"dataset_trn\").run_metadata['target'].value\n", - " df_inference = data_loader(\n", - " random_state=random_state, is_inference=True\n", - " )\n", - " df_inference = inference_preprocessor(\n", - " dataset_inf=df_inference,\n", - " preprocess_pipeline=ExternalArtifact(name=\"preprocess_pipeline\"),\n", - " target=target,\n", - " )\n", - " inference_predict(\n", - " dataset_inf=df_inference,\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f0d9ebb6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;35m\u001b[0m\u001b[1;36mversion\u001b[1;35m \u001b[0m\u001b[1;36mproduction\u001b[1;35m matches one of the possible \u001b[0m\u001b[1;36mModelStages\u001b[1;35m and will be fetched using stage.\u001b[0m\n" - ] - } - ], - "source": [ - "pipeline_args = {}\n", - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"inference.yaml\")\n", - "fe_b_configured = batch_inference.with_options(**pipeline_args)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9901c6d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mUsing an external artifact as step input currently invalidates caching for the step and all downstream steps. Future releases will introduce hashing of artifacts which will improve this behavior.\u001b[0m\n", - "\u001b[1;35mInitiating a new run for the pipeline: \u001b[0m\u001b[1;36mbatch_inference\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mReusing registered version: \u001b[0m\u001b[1;36m(version: 1)\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mExecuting a new run.\u001b[0m\n", - "\u001b[1;35mUsing user: \u001b[0m\u001b[1;36malexej@zenml.io\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mUsing stack: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m artifact_store: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35m orchestrator: \u001b[0m\u001b[1;36mdefault\u001b[1;35m\u001b[0m\n", - "\u001b[1;35mUsing cached version of \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36mdata_loader\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_preprocessor\u001b[1;35m has started.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_preprocessor\u001b[1;35m has finished in \u001b[0m\u001b[1;36m8.661s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_predict\u001b[1;35m has started.\u001b[0m\n", - "name='breast_cancer_classifier' license='Apache 2.0' description='Classification of Breast Cancer Dataset.' audience=None use_cases=None limitations=None trade_offs=None ethics=None tags=['classification', 'sklearn'] version='production' save_models_to_registry=True suppress_class_validation_warnings=True was_created_in_this_run=False\n", - "\u001b[33mYou specified both an ID as well as a version of the artifacts. Ignoring the version and fetching the artifacts by ID.\u001b[0m\n", - "\u001b[33mYour artifact was materialized under Python version 'unknown' but you are currently using '3.9.13'. This might cause unexpected behavior since pickle is not reproducible across Python versions. 
Attempting to load anyway...\u001b[0m\n", - "\u001b[33mCould not import Azure service connector: No module named 'azure.identity'.\u001b[0m\n", - "[1 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 1 1 1]\n", - "\u001b[1;35mStep \u001b[0m\u001b[1;36minference_predict\u001b[1;35m has finished in \u001b[0m\u001b[1;36m18.218s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mRun \u001b[0m\u001b[1;36mbatch_inference-2023_12_06-09_11_29_924914\u001b[1;35m has finished in \u001b[0m\u001b[1;36m32.726s\u001b[1;35m.\u001b[0m\n", - "\u001b[1;35mDashboard URL: https://1cf18d95-zenml.cloudinfra.zenml.io/workspaces/default/pipelines/2979acb2-c862-480a-8f50-a2be4c76a8a2/runs/7886e370-b05a-4205-931e-e4994fabd897/dag\u001b[0m\n" - ] - } - ], - "source": [ - "fe_b_configured()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98d39df8", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "51690802-31a7-4e6d-9f88-e6457c6c4a96", - "metadata": {}, - "source": [ - "# Huggingface Model to Sagemaker Endpoint: Automating MLOps with ZenML\n", - "Deploying Huggingface models to AWS Sagemaker endpoints typically only requires a few lines of code. However, there's a growing demand to not just deploy, but to seamlessly automate the entire flow from training to production with comprehensive lineage tracking. ZenML adeptly fills this niche, providing an end-to-end MLOps solution for Huggingface users wishing to deploy to Sagemaker. Below, we'll walk through the architecture that ZenML employs to bring a Huggingface model into production with AWS Sagemaker. Of course, all of this can be adapted to not just Sagemaker, but any other model deployment service like GCP Vertex or Azure ML Platform.\n", - "\n", - "This blog post showcases one way of using ZenML pipelines to achieve this:\n", - "\n", - "- Create and version a dataset in a feature_engineering_pipeline.\n", - "- Train/Finetune a BERT-based Sentiment Analysis NLP model and push to Huggingface Hub in a training_pipeline.\n", - "- Promote this model to Production by comparing to previous models in a promotion_pipeline.\n", - "- Deploy the model at the Production Stage to an AWS Sagemaker endpoint with a deployment_pipeline.\n", - "\n", - "\"Pipelines" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "500e3c24-b105-4a69-b2fc-e0ce1f1c1d46", - "metadata": {}, - "outputs": [], - "source": [ - "# Do the imports at the top\n", - "\n", - "import numpy as np\n", - "from datasets import DatasetDict, load_dataset\n", - "from typing_extensions import Annotated\n", - "from zenml import step\n", - "from zenml.logger import get_logger\n", - "\n", - "import os\n", - "from typing import Optional\n", - "from datetime import datetime as dt\n", - "\n", - "from zenml import pipeline\n", - "from zenml import Model\n", - "\n", - "from steps import (\n", - " data_loader,\n", - " notify_on_failure,\n", - " tokenization_step,\n", - " tokenizer_loader,\n", - " generate_reference_and_comparison_datasets,\n", - ")\n", - "from zenml.integrations.evidently.metrics import EvidentlyMetricConfig\n", - "from zenml.integrations.evidently.steps import (\n", - " EvidentlyColumnMapping,\n", - " evidently_report_step,\n", - ")\n", - "\n", - "from pipelines import (\n", - " sentinment_analysis_deploy_pipeline,\n", - " sentinment_analysis_promote_pipeline,\n", - " sentinment_analysis_training_pipeline,\n", - ")\n", - "\n", - "logger = get_logger(__name__)" - ] - }, - { - "cell_type": "markdown", - "id":
"fc77b660-e206-46b1-a924-407e797a8f47", - "metadata": {}, - "source": [ - "# 🐳 Breaking it down\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "31edaf46-6981-42be-99b7-9bdd91c160d5", - "metadata": {}, - "source": [ - "## 👶 Step 1: Start with feature engineering\n", - "\n", - "Automated feature engineering forms the foundation of this MLOps workflow. That's why the first pipeline is the feature engineering pipeline. This pipeline loads some data from Huggingface and uses a base tokenizer to create a tokenized dataset. The data loader step is a simple Python function that returns a Huggingface `DatasetDict` object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35de0e4c-b6f8-4b68-927a-f40e4130dc93", - "metadata": {}, - "outputs": [], - "source": [ - "@step\n", - "def data_loader() -> Annotated[DatasetDict, \"dataset\"]:\n", - " logger.info(f\"Loading dataset airline_reviews... \")\n", - " hf_dataset = load_dataset(\"Shayanvsf/US_Airline_Sentiment\")\n", - " hf_dataset = hf_dataset.rename_column(\"airline_sentiment\", \"label\")\n", - " hf_dataset = hf_dataset.remove_columns(\n", - " [\"airline_sentiment_confidence\", \"negativereason_confidence\"]\n", - " )\n", - " return hf_dataset" - ] - }, - { - "cell_type": "markdown", - "id": "49e4462c-1e64-48d3-bae7-76696a958646", - "metadata": {}, - "source": [ - "Notice that you can give each dataset a name with Python's Annotated object. The DatasetDict is a native Huggingface dataset which ZenML knows how to persist through steps. This flow ensures reproducibility and version control for every dataset iteration.\n", - "\n", - "Also notice this is a simple Python function that can be called with the `entrypoint` wrapper:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18144a6b-c266-453d-82c8-b5d6aa1be0aa", - "metadata": {}, - "outputs": [], - "source": [ - "hf_dataset = data_loader.entrypoint()\n", - "print(hf_dataset)" - ] - }, - { - "cell_type": "markdown", - "id": "31330d3c-044f-4912-8d36-74146f48cecf", - "metadata": {}, - "source": [ - "Now we put this into a full feature engineering pipeline. Each run of the feature engineering pipeline produces a new dataset to use for the training pipeline.
ZenML versions this data as it flows through the pipeline.\n", - "\n", - "\"Pipelines" - ] - }, - { - "cell_type": "markdown", - "id": "9511bd84-1e97-42db-9b75-06285cc6904c", - "metadata": {}, - "source": [ - "### Set your stack" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76f3a7e7-0d85-43b3-9e9f-4c7f20ea65e6", - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack describe hf-sagemaker-local" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04b0bf69-70c6-4408-b18c-95df9e030c0c", - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack set hf-sagemaker-local" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de5398a4-a9ec-42d6-bbd6-390244c52d13", - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack get" - ] - }, - { - "cell_type": "markdown", - "id": "152f718d-70c2-4a29-a73e-37db85675cb8", - "metadata": {}, - "source": [ - "### Run the pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ca6c41e-e4b3-46d2-8264-9a453ac9aa3c", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "@pipeline(on_failure=notify_on_failure)\n", - "def sentinment_analysis_feature_engineering_pipeline(\n", - " lower_case: Optional[bool] = True,\n", - " padding: Optional[str] = \"max_length\",\n", - " max_seq_length: Optional[int] = 128,\n", - " text_column: Optional[str] = \"text\",\n", - " label_column: Optional[str] = \"label\",\n", - "):\n", - " # Link all the steps together by calling them and passing the output\n", - " # of one step as the input of the next step.\n", - "\n", - " ########## Load Dataset stage ##########\n", - " dataset = data_loader()\n", - "\n", - " ########## Data Quality stage ##########\n", - " reference_dataset, comparison_dataset = generate_reference_and_comparison_datasets(\n", - " dataset\n", - " )\n", - " text_data_report = evidently_report_step.with_options(\n", - " parameters=dict(\n", - " column_mapping=EvidentlyColumnMapping(\n", - " target=\"label\",\n", - " text_features=[\"text\"],\n", - " ),\n", - " metrics=[\n", - " EvidentlyMetricConfig.metric(\"DataQualityPreset\"),\n", - " EvidentlyMetricConfig.metric(\n", - " \"TextOverviewPreset\", column_name=\"text\"\n", - " ),\n", - " ],\n", - " # We need to download the NLTK data for the TextOverviewPreset\n", - " download_nltk_data=True,\n", - " ),\n", - " )\n", - " text_data_report(reference_dataset, comparison_dataset)\n", - "\n", - " ########## Tokenization stage ##########\n", - " tokenizer = tokenizer_loader(lower_case=lower_case)\n", - " tokenized_data = tokenization_step(\n", - " dataset=dataset,\n", - " tokenizer=tokenizer,\n", - " padding=padding,\n", - " max_seq_length=max_seq_length,\n", - " text_column=text_column,\n", - " label_column=label_column,\n", - " )\n", - " return tokenizer, tokenized_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c8a5be7-ebaa-41c4-ac23-4afc6e7e06aa", - "metadata": {}, - "outputs": [], - "source": [ - "# Run a pipeline with the required parameters. 
\n", - "no_cache: bool = True\n", - "zenml_model_name: str = \"distil_bert_sentiment_analysis\"\n", - "max_seq_length = 512\n", - "\n", - "# This executes all steps in the pipeline in the correct order using the orchestrator\n", - "# stack component that is configured in your active ZenML stack.\n", - "zenml_model = Model(\n", - " name=zenml_model_name,\n", - " license=\"Apache 2.0\",\n", - " description=\"Show case Model Control Plane.\",\n", - " tags=[\"sentiment_analysis\", \"huggingface\"],\n", - ")\n", - "\n", - "pipeline_args = {}\n", - "\n", - "if no_cache:\n", - " pipeline_args[\"enable_cache\"] = False\n", - "\n", - "# Execute Feature Engineering Pipeline\n", - "pipeline_args[\"model\"] = zenml_model\n", - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"feature_engineering_config.yaml\")\n", - "run_args_feature = {\n", - " \"max_seq_length\": max_seq_length,\n", - "}\n", - "pipeline_args[\n", - " \"run_name\"\n", - "] = f\"sentinment_analysis_feature_engineering_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\"\n", - "p = sentinment_analysis_feature_engineering_pipeline.with_options(**pipeline_args)\n", - "p(**run_args_feature)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e7c1ea2-64fe-478a-9963-17c7b7f62110", - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.client import Client\n", - "from IPython.display import display, HTML\n", - "\n", - "client = Client()\n", - "# CHANGE THIS TO THE LATEST RUN ID\n", - "latest_run = client.get_pipeline_run(\"sentinment_analysis_feature_engineering_pipeline_run_2023_11_21_10_55_56\")\n", - "html = latest_run.steps[\"evidently_report_step\"].outputs['report_html'].load()\n", - "display(HTML(html))" - ] - }, - { - "cell_type": "markdown", - "id": "78ab8771-4421-4975-a3d5-12892a56b805", - "metadata": {}, - "source": [ - "## ๐Ÿ’ช Step 2: Train the model with Huggingface Hub as the model registry\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "2843efa8-32b6-4b13-ac85-33c99cc94e3e", - "metadata": {}, - "source": [ - "Once the feature engineering pipeline has run a few times, we have many datasets to choose from. We can feed our desired one into a function that trains the model on the data. Thanks to the ZenML Huggingface integration, this data is loaded directly from the ZenML artifact store.\n", - "\n", - "\"Pipelines\n", - "\n", - "On the left side, we see our local MLOps stack, which defines our infrastructure and tooling we are using for this particular pipeline. ZenML makes it easy to run on a local stack on your development machine, or switch out the stack to run on a AWS Kubeflow-based stack (if you want to scale up).\n", - "\n", - "On the right side is the new kid on the block - the ZenML Model Control Plane. The Model Control Plane is a new feature in ZenML that allows users to have a complete overview of their machine learning models. 
It allows teams to consolidate all artifacts related to their ML models into one place, and manage its lifecycle easily as you can see from this view from the ZenML Cloud:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c99b20f-8e3b-4119-86e9-33dd1395470a", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"trainer_config.yaml\")\n", - "\n", - "pipeline_args[\"enable_cache\"] = True\n", - "\n", - "run_args_train = {\n", - " \"num_epochs\": 1,\n", - " \"train_batch_size\": 64,\n", - " \"eval_batch_size\": 64,\n", - " \"learning_rate\": 2e-4,\n", - " \"weight_decay\": 0.01,\n", - " \"max_seq_length\": 512,\n", - "}\n", - "\n", - "# Use versioned artifacts from the last step\n", - "# run_args_train[\"dataset_artifact_id\"] = latest_run.steps['tokenization_step'].output.id\n", - "# run_args_train[\"tokenizer_artifact_id\"] = latest_run.steps['tokenizer_loader'].output.id\n", - "\n", - "# Configure the model\n", - "pipeline_args[\"model\"] = zenml_model\n", - "\n", - "pipeline_args[\n", - " \"run_name\"\n", - "] = f\"sentinment_analysis_training_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96592299-0090-4d2a-962e-6ca232c1fb75", - "metadata": {}, - "outputs": [], - "source": [ - "sentinment_analysis_training_pipeline.with_options(**pipeline_args)(\n", - " **run_args_train\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e24e29de-6d1b-41da-9ab2-ca2b32f1f540", - "metadata": {}, - "outputs": [], - "source": [ - "### Check out a new stack\n", - "!zenml stack describe hf-sagemaker-airflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c9a5bee-8465-4d41-888a-093f1f6a2ef1", - "metadata": {}, - "outputs": [], - "source": [ - "### Change the stack\n", - "!zenml stack set hf-sagemaker-airflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3772c50-1c90-4ffc-8394-c9cfca16cc53", - "metadata": {}, - "outputs": [], - "source": [ - "sentinment_analysis_training_pipeline.with_options(**pipeline_args)(\n", - " **run_args_train\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "be79f454-a45d-4f5f-aa93-330d52069124", - "metadata": {}, - "source": [ - "## ๐Ÿซ… Step 3: Promote the model to production\n" - ] - }, - { - "cell_type": "markdown", - "id": "5a09b432-7a66-473e-bdb6-ffdca730498b", - "metadata": {}, - "source": [ - "Following training, the automated promotion pipeline evaluates models against predefined metrics, identifying and marking the most performant one as 'Production ready'. 
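The promotion pipeline's comparison logic is not shown in this notebook. Purely as an illustration, here is a minimal sketch of how such a check could query the Model Control Plane; it assumes each model version carries an `accuracy` entry in its run metadata (the metadata key and the comparison rule are assumptions, not the repo's actual code), and the exact exception raised when no `production` version exists yet depends on the ZenML version:

```python
from zenml import Model
from zenml.client import Client


def promote_if_better(model_name: str = "distil_bert_sentiment_analysis") -> None:
    """Illustrative sketch: promote the latest version if it beats production."""
    client = Client()
    latest = client.get_model_version(model_name, "latest")
    latest_acc = float(latest.run_metadata["accuracy"].value)

    try:
        current = client.get_model_version(model_name, "production")
        current_acc = float(current.run_metadata["accuracy"].value)
    except Exception:
        current_acc = 0.0  # no production version yet -> promote unconditionally

    if latest_acc >= current_acc:
        # Mark the latest version as the new production model.
        Model(name=model_name, version=latest.name).set_stage("production", force=True)
```
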
This is another common use case for the Model Control Plane; we store the relevant metrics there to access them easily later.\n", - "\n", - "\"Pipelines" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5bac7ae5-70d0-449c-929c-e175c3062f2d", - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack set hf-sagemaker-local" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "170c9ef6-4e6f-4e50-ac37-e05bef8570ea", - "metadata": {}, - "outputs": [], - "source": [ - "run_args_promoting = {}\n", - "zenml_model = Model(name=zenml_model_name)\n", - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"promoting_config.yaml\")\n", - "\n", - "pipeline_args[\"model\"] = zenml_model\n", - "\n", - "pipeline_args[\n", - " \"run_name\"\n", - "] = f\"sentinment_analysis_promoting_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6df11e2-4591-4186-a8f8-243f9c4d1e3d", - "metadata": {}, - "outputs": [], - "source": [ - "sentinment_analysis_promote_pipeline.with_options(**pipeline_args)(\n", - " **run_args_promoting\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6efc4968-35fd-42e3-ba62-d8e1557aa0d6", - "metadata": {}, - "source": [ - "## 💯 Step 4: Deploy the model to AWS Sagemaker Endpoints\n" - ] - }, - { - "cell_type": "markdown", - "id": "577aff86-bde9-48d4-9b52-209cfed9fd4e", - "metadata": {}, - "source": [ - "This is the final step to automate the deployment of the slated production model to a Sagemaker endpoint. The deployment pipeline handles the complexities of AWS interactions and ensures that the model, along with its full history and context, is transitioned into a live environment ready for use. Here again we use the Model Control Plane interface to query the Huggingface revision and use that information to push to Huggingface Hub.\n", - "\n", - "\"Pipelines\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1513ab5f-de05-4344-9d2c-fedbfbd21ef0", - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack set hf-sagemaker-local" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "606fdb3c-4eca-4d32-bccb-280743d15528", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_args[\"config_path\"] = os.path.join(\"configs\", \"deploying_config.yaml\")\n", - "\n", - "# Deploying pipeline has new ZenML model config\n", - "zenml_model = Model(\n", - " name=zenml_model_name,\n", - " version=\"production\",\n", - ")\n", - "pipeline_args[\"model\"] = zenml_model\n", - "pipeline_args[\"enable_cache\"] = False\n", - "run_args_deploying = {}\n", - "pipeline_args[\n", - " \"run_name\"\n", - "] = f\"sentinment_analysis_deploy_pipeline_run_{dt.now().strftime('%Y_%m_%d_%H_%M_%S')}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87f1f982-ab96-4207-8e7e-e318473587e9", - "metadata": {}, - "outputs": [], - "source": [ - "sentinment_analysis_deploy_pipeline.with_options(**pipeline_args)(\n", - " **run_args_deploying\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", - "metadata": {}, - "source": [ - "ZenML builds upon the straightforward deployment capability of Huggingface models to AWS Sagemaker, and transforms it into a sophisticated, repeatable, and transparent MLOps workflow.
It takes charge of the intricate steps necessary for modern ML systems, ensuring that software engineering leads can focus on iteration and innovation rather than operational intricacies.\n", - "\n", - "To delve deeper into each stage, refer to the comprehensive guide on GitHub: [zenml-io/zenml-huggingface-sagemaker](https://github.com/zenml-io/zenml-huggingface-sagemaker). Additionally, [this YouTube playlist](https://www.youtube.com/watch?v=Q1EH2H8Akgo&list=PLhNrLW_IWplw6dBbmGcL828-atJMu3CwF) provides a detailed visual walkthrough of the entire pipeline: Huggingface to Sagemaker ZenML tutorial.\n", - "\n", - "Interested in standardizing your MLOps workflows? ZenML Cloud is now available to all - get a managed ZenML server with important features such as RBAC and pipeline triggers. [Book a demo](https://zenml.io/book-a-demo) with us now to learn how you can create your own MLOps pipelines today." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/stack-showcase/run_deploy.ipynb b/stack-showcase/run_deploy.ipynb deleted file mode 100644 index 281f7507..00000000 --- a/stack-showcase/run_deploy.ipynb +++ /dev/null @@ -1,175 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "63ab391a", - "metadata": {}, - "source": [ - "# Intro to MLOps using ZenML\n", - "\n", - "## 🌍 Overview\n", - "\n", - "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production.\n", - "\n", - "Follow along with this notebook to understand how you can use ZenML to productionalize your ML workflows!\n", - "\n", - "\"Pipelines" - ] - }, - { - "cell_type": "markdown", - "id": "8c28b474", - "metadata": {}, - "source": [ - "# ⌚ Step 1: (Feature engineering) + Training pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "8e5a76e6-8655-47d5-ab61-015b2d69d720", - "metadata": {}, - "source": [ - "Let's run the feature engineering pipeline\n", - "\n", - "\"Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "942a20f9-244b-4761-933e-55989a7377d6", - "metadata": {}, - "outputs": [], - "source": [ - "!python run.py --feature-pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "87909827", - "metadata": {}, - "source": [ - "Let's run the training pipeline\n", - "\n", - "\"Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fccf1bd9", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!python run.py --training-pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "d6306f14", - "metadata": {}, - "source": [ - "# 🫅 Step 2: The inference pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "b51f3108", - "metadata": {}, - "source": [ - "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n", - "with `live data`.
The critical step here is the `inference_predict` step, where we load the model in memory\n", - "and generate predictions:\n", - "\n", - "\"Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9918a8a1-c569-494f-aa40-cb7bd3aaea07", - "metadata": {}, - "outputs": [], - "source": [ - "!python run.py --inference-pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "36140d24-a280-48eb-bb03-5e03280e128c", - "metadata": {}, - "source": [ - "## Step 3: Deploying the pipeline to Huggingface" - ] - }, - { - "cell_type": "markdown", - "id": "13bd8087-2ab0-4f9d-8bff-6266a05eb6e7", - "metadata": {}, - "source": [ - "\"Deployment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8000849c-1ce8-4900-846e-3ef1873561f8", - "metadata": {}, - "outputs": [], - "source": [ - "!python run.py --deployment-pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", - "metadata": {}, - "source": [ - "## Congratulations!\n", - "\n", - "You're a legit MLOps engineer now! You trained two models, evaluated them against\n", - "a test set, registered the best one with the ZenML model control plane,\n", - "and served some predictions. You also learned how to iterate on your models and\n", - "data by using some of the ZenML utility abstractions. You saw how to view your\n", - "artifacts and models via the client as well as the ZenML Dashboard.\n", - "\n", - "## Further exploration\n", - "\n", - "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n", - "about the capabilities of ZenML. For example, you might want to:\n", - "\n", - "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/deploying-zenml#connecting-to-a-deployed-zenml) to collaborate with your colleagues.\n", - "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n", - "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n", - "\n", - "## What next?\n", - "\n", - "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n", - "* If you want to quickly get started with ZenML, check out the [ZenML Cloud](https://zenml.io/cloud)." 
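The `run.py` script that the cells above invoke is not part of this diff. For orientation, here is a hedged sketch of how such an entrypoint could map the CLI flags to the pipelines exported from `pipelines/__init__.py`; the `click` wiring and the config file names are assumptions, not the repo's actual script:

```python
# run.py -- illustrative sketch only; the actual script is not shown in this diff.
import os

import click

from pipelines import (
    breast_cancer_deployment_pipeline,
    breast_cancer_training,
    feature_engineering,
    inference,
)


@click.command()
@click.option("--feature-pipeline", is_flag=True, default=False)
@click.option("--training-pipeline", is_flag=True, default=False)
@click.option("--inference-pipeline", is_flag=True, default=False)
@click.option("--deployment-pipeline", is_flag=True, default=False)
def main(
    feature_pipeline: bool,
    training_pipeline: bool,
    inference_pipeline: bool,
    deployment_pipeline: bool,
):
    if feature_pipeline:
        # Each pipeline is configured from a YAML file under configs/.
        feature_engineering.with_options(
            config_path=os.path.join("configs", "feature_engineering.yaml")
        )()
    if training_pipeline:
        breast_cancer_training()
    if inference_pipeline:
        inference.with_options(
            config_path=os.path.join("configs", "inference.yaml")
        )()
    if deployment_pipeline:
        breast_cancer_deployment_pipeline()


if __name__ == "__main__":
    main()
```
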
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/stack-showcase/run_stack_showcase.ipynb b/stack-showcase/run_stack_showcase.ipynb deleted file mode 100644 index 92e6dd7e..00000000 --- a/stack-showcase/run_stack_showcase.ipynb +++ /dev/null @@ -1,347 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "b567a1d3-f625-4b98-9852-fcc3f3fe9609", - "metadata": {}, - "outputs": [], - "source": [ - "# To start with, we use the default stack\n", - "#!zenml init\n", - "\n", - "# We also need to connect to a remote ZenML Instance\n", - "#!zenml connect --url https://1cf18d95-zenml.cloudinfra.zenml.io" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c53367f1-3951-48c7-9540-21daf818fa5d", - "metadata": {}, - "outputs": [], - "source": [ - "# Do the imports at the top\n", - "\n", - "import random\n", - "from zenml import ExternalArtifact, pipeline \n", - "from zenml.client import Client\n", - "from zenml.logger import get_logger\n", - "from uuid import UUID\n", - "\n", - "import os\n", - "from typing import Optional, List\n", - "\n", - "from zenml import pipeline\n", - "from zenml import Model\n", - "\n", - "from pipelines import feature_engineering\n", - "\n", - "from steps import (\n", - " data_loader,\n", - " data_preprocessor,\n", - " data_splitter,\n", - " model_evaluator,\n", - " model_trainer,\n", - " inference_predict,\n", - " inference_preprocessor\n", - ")\n", - "\n", - "logger = get_logger(__name__)\n", - "\n", - "client = Client()\n", - "client.activate_stack(\"local-mlflow-stack\")" - ] - }, - { - "cell_type": "markdown", - "id": "ab87746e-b804-4fab-88f6-d4967048cb45", - "metadata": {}, - "source": [ - "# Start local with a simple training pipeline\n", - "\n", - "Below you can see what the pipeline looks like. We will start by running this locally on the default-stack. This means the data between the steps is stored locally and the compute is also local." - ] - }, - { - "cell_type": "markdown", - "id": "33872b19-7329-4f5e-9a1e-cfc1fe9d560d", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "\"Drawing\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06625571-b281-4820-a7eb-3a085ba2e572", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.datasets import load_breast_cancer\n", - "from zenml import step\n", - "from zenml.logger import get_logger\n", - "\n", - "logger = get_logger(__name__)\n", - "\n", - "# Here is what one of the steps in the pipeline looks like. 
A simple Python function that just needs the `@step` decorator.\n", - "\n", - "@step\n", - "def data_loader() -> pd.DataFrame:\n", - " \"\"\"Dataset reader step.\"\"\"\n", - " dataset = load_breast_cancer(as_frame=True)\n", - " inference_size = int(len(dataset.target) * 0.05)\n", - " dataset: pd.DataFrame = dataset.frame\n", - " dataset.reset_index(drop=True, inplace=True)\n", - " logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n", - "\n", - " return dataset\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "754a3069-9d13-4869-be64-a641071800cc", - "metadata": {}, - "outputs": [], - "source": [ - "# Here's an example of what this function returns\n", - "\n", - "data_loader()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8aa300f1-48df-4e62-87eb-0e2fc5735da8", - "metadata": {}, - "outputs": [], - "source": [ - "from zenml import pipeline\n", - "\n", - "@pipeline\n", - "def breast_cancer_training(\n", - " train_dataset_id: Optional[UUID] = None,\n", - " test_dataset_id: Optional[UUID] = None,\n", - " min_train_accuracy: float = 0.0,\n", - " min_test_accuracy: float = 0.0,\n", - "):\n", - " \"\"\"Model training pipeline.\"\"\"\n", - " # Execute Feature Engineering Pipeline\n", - " dataset_trn, dataset_tst = feature_engineering()\n", - "\n", - " model = model_trainer(\n", - " dataset_trn=dataset_trn,\n", - " )\n", - "\n", - " model_evaluator(\n", - " model=model,\n", - " dataset_trn=dataset_trn,\n", - " dataset_tst=dataset_tst,\n", - " min_train_accuracy=min_train_accuracy,\n", - " min_test_accuracy=min_test_accuracy,\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d55342bf-33c5-4646-b1ce-e599a99cf568", - "metadata": {}, - "outputs": [], - "source": [ - "zenml_model = Model(\n", - " name=\"breast_cancer_classifier_model\",\n", - " description=\"Classification of Breast Cancer Dataset.\",\n", - " delete_new_version_on_failure=True,\n", - " tags=[\"classification\", \"sklearn\"],\n", - ")\n", - "\n", - "pipeline_args = {\n", - " \"enable_cache\": True, \n", - " \"model\": zenml_model\n", - "}\n", - "\n", - "# Model Version config\n", - "fe_t_configured = breast_cancer_training.with_options(**pipeline_args)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5f4aed8-7d87-4e07-a25c-345d327ad636", - "metadata": {}, - "outputs": [], - "source": [ - "fe_t_configured()" - ] - }, - { - "cell_type": "markdown", - "id": "c3e6dc42-21b8-4b3c-90ec-d6e6d541907f", - "metadata": {}, - "source": [ - "# Let's outsource some compute to Sagemaker!" - ] - }, - { - "cell_type": "markdown", - "id": "14a840b1-288d-4713-98f4-bbe8d6e06140", - "metadata": {}, - "source": [ - "Let's farm some compute out to AWS with a training job with a certain number of CPUs and memory. This can easily be done without any changes to the actual implementation of the pipeline.
" - ] - }, - { - "cell_type": "markdown", - "id": "fa9308fb-3556-472c-8fc7-7f2f88d1c455", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\"Drawing\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48be8f60-9fbe-4d19-92e4-d9cd8289dbf7", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# This pip installs the requirements locally\n", - "!zenml integration install aws s3 mlflow -y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cb26018-aa7d-497d-a0e2-855d3becb70d", - "metadata": {}, - "outputs": [], - "source": [ - "client.activate_stack(\"local-sagemaker-step-operator-stack\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5683a1c9-f5c1-4ba1-ad7c-1e427fd265df", - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.config import DockerSettings\n", - "\n", - "# The actual code will stay the same, all that needs to be done is some configuration\n", - "step_args = {}\n", - "\n", - "# We configure which step operator should be used\n", - "step_args[\"step_operator\"] = \"sagemaker-eu\"\n", - "\n", - "# M5 Large is what we need for this big data!\n", - "step_args[\"settings\"] = {\"step_operator.sagemaker\": {\"estimator_args\": {\"instance_type\" : \"ml.m5.large\"}}}\n", - "\n", - "# Update the step. We could also do this in YAML\n", - "model_trainer = model_trainer.with_options(**step_args)\n", - "\n", - "docker_settings = DockerSettings(\n", - " requirements=[\n", - " \"pyarrow\",\n", - " \"scikit-learn==1.1.1\"\n", - " ],\n", - ")\n", - "\n", - "pipeline_args = {\n", - " \"enable_cache\": True, \n", - " \"model\": zenml_model,\n", - " \"settings\": {\"docker\": docker_settings}\n", - "}\n", - "\n", - "fe_t_configured = breast_cancer_training.with_options(**pipeline_args)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85179f52-68f0-4c8d-9808-6b080bec72c3", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Lets run the pipeline\n", - "fe_t_configured()" - ] - }, - { - "cell_type": "markdown", - "id": "0841f93b-9eb5-4af6-bba7-cec167024ccf", - "metadata": {}, - "source": [ - "# Switch to full Sagemaker Stack\n", - "\n", - "Just one command will allow you to switch the full code execution over to sagemaker. No Sagemaker domain knowledge necessary. No setup of VMs or Kubernetes clusters necessary. 
No maintenance of any infrastructure either.\n", - "\n", - "![Sagemaker local stack](_assets/sagemaker_stack.png)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8e33484-3377-4f0e-83fa-87d7c0ca4d72", - "metadata": {}, - "outputs": [], - "source": [ - "# Finally, this is all that needs to be done to switch the code to run fully on Sagemaker\n", - "client.activate_stack(\"sagemaker-stack\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a03c95e9-df2e-446c-8d61-9cc37ad8a46a", - "metadata": {}, - "outputs": [], - "source": [ - "fe_t_configured()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/stack-showcase/steps/deploy_to_huggingface.py b/stack-showcase/steps/deploy_to_huggingface.py deleted file mode 100644 index 888bd397..00000000 --- a/stack-showcase/steps/deploy_to_huggingface.py +++ /dev/null @@ -1,69 +0,0 @@ -# Apache Software License 2.0 -# -# Copyright (c) ZenML GmbH 2023. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from typing_extensions import Annotated -from huggingface_hub import create_branch, login, HfApi - -from zenml import step, log_artifact_metadata -from zenml.client import Client -from zenml.logger import get_logger - -# Initialize logger -logger = get_logger(__name__) - - -@step(enable_cache=False) -def deploy_to_huggingface( - repo_name: str, -) -> Annotated[str, "huggingface_url"]: - """ - This step deploys the model to Huggingface. - - Args: - repo_name: The name of the repo to create/use on Huggingface. - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - secret = Client().get_secret("huggingface_creds") - assert secret, "No secret found with name 'huggingface_creds'. Please create one that includes your `username` and `token`." - token = secret.secret_values["token"] - api = HfApi(token=token) - hf_repo = api.create_repo(repo_id=repo_name, repo_type="space", space_sdk="gradio", exist_ok=True) - zenml_repo_root = Client().root - if not zenml_repo_root: - logger.warning( - "You're running the `deploy_to_huggingface` step outside of a ZenML repo. " - "Since the deployment step to huggingface is all about pushing the repo to huggingface, " - "this step will not work outside of a ZenML repo where the gradio folder is present."
- ) - raise RuntimeError("deploy_to_huggingface must be run from inside a ZenML repository.") - url = api.upload_folder( - folder_path=zenml_repo_root, repo_id=hf_repo.repo_id, repo_type="space", - ) - repo_commits = api.list_repo_commits( - repo_id=hf_repo.repo_id, - repo_type="space", - ) - log_artifact_metadata( - artifact_name="huggingface_url", - metadata={ - "repo_id": hf_repo.repo_id, - "revision": repo_commits[0].commit_id, - }, - ) - logger.info(f"Model updated: {url}") - ### YOUR CODE ENDS HERE ### - return url diff --git a/stack-showcase/steps/model_promoter.py b/stack-showcase/steps/model_promoter.py deleted file mode 100644 index 04b9ce2a..00000000 --- a/stack-showcase/steps/model_promoter.py +++ /dev/null @@ -1,42 +0,0 @@ -# {% include 'template/license_header' %} - -from zenml import get_step_context, step -from zenml.logger import get_logger - -logger = get_logger(__name__) - - -@step -def model_promoter(accuracy: float, stage: str = "production") -> bool: - """Model promoter step. - - This is an example of a step that promotes a trained model to a given - stage based on the accuracy it achieved. - - This step is parameterized, which allows you to configure the step - independently of the step code, before running it in a pipeline. - In this example, the step can be configured with the stage to promote - the model to. See the documentation for more information: - - https://docs.zenml.io/user-guide/advanced-guide/configure-steps-pipelines - - Args: - accuracy: Accuracy of the model. - stage: Which stage to promote the model to. - - Returns: - Whether the model was promoted or not. - """ - ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ### - if accuracy < 0.8: - logger.info( - f"Model accuracy {accuracy*100:.2f}% is below 80%! Not promoting model." - ) - is_promoted = False - else: - logger.info(f"Model promoted to {stage}!") - is_promoted = True - zenml_model = get_step_context().model - zenml_model.set_stage(stage, force=True) - - ### YOUR CODE ENDS HERE ### - return is_promoted diff --git a/stack-showcase/steps/model_trainer.py b/stack-showcase/steps/model_trainer.py deleted file mode 100644 index e278ad61..00000000 --- a/stack-showcase/steps/model_trainer.py +++ /dev/null @@ -1,52 +0,0 @@ -# {% include 'template/license_header' %} - -import mlflow -import pandas as pd -from sklearn.base import ClassifierMixin -from sklearn.tree import DecisionTreeClassifier -from typing_extensions import Annotated -from zenml import ArtifactConfig, step -from zenml.client import Client -from zenml.logger import get_logger - -logger = get_logger(__name__) - -experiment_tracker = Client().active_stack.experiment_tracker - -@step(enable_cache=False, experiment_tracker="mlflow", step_operator="sagemaker-eu") -def model_trainer( - dataset_trn: pd.DataFrame, -) -> Annotated[ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True)]: - """Configure and train a model on the training dataset. - - This is an example of a model training step that takes in a dataset artifact - previously loaded and pre-processed by other steps in your pipeline, then - configures and trains a model on it. The model is then returned as a step - output artifact. - - Args: - dataset_trn: The preprocessed train dataset. - - Returns: - The trained model artifact.
- """ - # Use the dataset to fetch the target - # context = get_step_context() - # target = context.inputs["dataset_trn"].run_metadata['target'].value - target = "target" - - # Enable autologging before training so that parameters and metrics - # are actually captured by MLflow during the fit below. - mlflow.sklearn.autolog() - - # Initialize the model with the hyperparameters indicated in the step - # parameters and train it on the training set. - model = DecisionTreeClassifier() - logger.info(f"Training model {model}...") - - model.fit( - dataset_trn.drop(columns=[target]), - dataset_trn[target], - ) - - mlflow.sklearn.log_model(model, "breast_cancer_classifier_model") - - return model