zenml-io · avishniakov · Feb 19, 2024 · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024
diff --git a/.gitignore b/.gitignore
@@ -103,7 +103,7 @@ celerybeat.pid
 
 # Environments
 .env
-.venv
+.venv*
 env/
 venv/
 ENV/
@@ -142,3 +142,5 @@ mlruns/
 
 zencoder/cloned_public_repos
 *wandb*
+
+.DS_Store
diff --git a/stack-showcase/.dockerignore → classifier-e2e/.dockerignore b/stack-showcase/.dockerignore → classifier-e2e/.dockerignore
diff --git a/stack-showcase/README.md → classifier-e2e/README.md b/stack-showcase/README.md → classifier-e2e/README.md
diff --git a/classifier-e2e/_assets/cloud_mcp.png b/classifier-e2e/_assets/cloud_mcp.png
diff --git a/classifier-e2e/_assets/cloud_mcp_predictions.png b/classifier-e2e/_assets/cloud_mcp_predictions.png
diff --git a/classifier-e2e/_assets/cloud_mcp_screenshot.png b/classifier-e2e/_assets/cloud_mcp_screenshot.png
diff --git a/classifier-e2e/_assets/deployment_pipeline.png b/classifier-e2e/_assets/deployment_pipeline.png
diff --git a/classifier-e2e/_assets/feature_engineering_pipeline.png b/classifier-e2e/_assets/feature_engineering_pipeline.png
diff --git a/classifier-e2e/_assets/inference_pipeline.png b/classifier-e2e/_assets/inference_pipeline.png
diff --git a/classifier-e2e/_assets/pipeline_overview.png b/classifier-e2e/_assets/pipeline_overview.png
diff --git a/classifier-e2e/_assets/sagemaker_stack.png b/classifier-e2e/_assets/sagemaker_stack.png
diff --git a/classifier-e2e/_assets/training_pipeline.png b/classifier-e2e/_assets/training_pipeline.png
diff --git a/classifier-e2e/configs/feature_engineering.yaml b/classifier-e2e/configs/feature_engineering.yaml
@@ -0,0 +1,10 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# pipeline configuration
+test_size: 0.35
diff --git a/classifier-e2e/configs/inference.yaml b/classifier-e2e/configs/inference.yaml
@@ -0,0 +1,12 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: "breast_cancer_classifier"
+  version: "production"
diff --git a/stack-showcase/configs/training.yaml → classifier-e2e/configs/training_sgd.yaml b/stack-showcase/configs/training.yaml → classifier-e2e/configs/training_sgd.yaml
@@ -5,18 +5,21 @@ settings:
       - sklearn
     requirements:
       - pyarrow
-      - huggingface_hub
 
 # configuration of the Model Control Plane
 model:
   name: breast_cancer_classifier
   license: Apache 2.0
-  description: Classification of Breast Cancer Dataset.
-  tags: ["classification", "sklearn"]
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier","sgd"]
+
+# Configure the pipeline
+parameters:
+  model_type: "sgd"  # Choose between xgboost/sgd
 
 steps:
   model_trainer:
     settings:
       step_operator.sagemaker:
         estimator_args: 
-          instance_type: "ml.m5.large"
+          instance_type : ml.m5.large
diff --git a/classifier-e2e/configs/training_sgd_sagemaker.yaml b/classifier-e2e/configs/training_sgd_sagemaker.yaml
@@ -0,0 +1,26 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier","sgd"]
+
+# Configure the pipeline
+parameters:
+  model_type: "sgd"  # Choose between xgboost/sgd
+
+steps:
+  model_trainer:
+    step_operator: sagemaker-eu
+    settings:
+      step_operator.sagemaker:
+        estimator_args: 
+          instance_type : ml.m5.large
diff --git a/classifier-e2e/configs/training_xgboost.yaml b/classifier-e2e/configs/training_xgboost.yaml
@@ -0,0 +1,26 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+      - xgboost
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier","xgboost"]
+
+# Configure the pipeline
+parameters:
+  model_type: "xgboost"  # Choose between sgd/xgboost
+
+steps:
+  model_trainer:
+    settings:
+      step_operator.sagemaker:
+        estimator_args: 
+          instance_type : ml.m5.large
diff --git a/classifier-e2e/configs/training_xgboost_sagemaker.yaml b/classifier-e2e/configs/training_xgboost_sagemaker.yaml
@@ -0,0 +1,27 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+      - xgboost
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier","xgboost"]
+
+# Configure the pipeline
+parameters:
+  model_type: "xgboost"  # Choose between sgd/xgboost
+
+steps:
+  model_trainer:
+    step_operator: sagemaker-eu
+    settings:
+      step_operator.sagemaker:
+        estimator_args: 
+          instance_type : ml.m5.large
diff --git a/classifier-e2e/pipelines/__init__.py b/classifier-e2e/pipelines/__init__.py
@@ -0,0 +1,21 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .feature_engineering import feature_engineering
+from .inference import inference
+from .training import training
+from .deploy import deploy
diff --git a/classifier-e2e/pipelines/deploy.py b/classifier-e2e/pipelines/deploy.py
@@ -0,0 +1,25 @@
+from zenml import pipeline, get_pipeline_context
+from steps import data_loader, inference_preprocessor
+import random
+from steps import deploy_endpoint, predict_on_endpoint, shutdown_endpoint
+
+
+@pipeline
+def deploy(shutdown_endpoint_after_predicting: bool = True):
+    """
+    Endpoint deployment pipeline.
+
+    Loads inference data, preprocesses it with the `preprocess_pipeline`
+    artifact of the model attached to the pipeline context, then calls the
+    `deploy_endpoint` step and passes its output, together with the
+    preprocessed data, to `predict_on_endpoint`.
+
+    Args:
+        shutdown_endpoint_after_predicting: If `True` (the default), a
+            `shutdown_endpoint` step is added for the predictor once the
+            prediction step has been scheduled.
+    """
+    # Get the preprocess pipeline artifact associated with this version
+    preprocess_pipeline = get_pipeline_context().model.get_artifact(
+        "preprocess_pipeline"
+    )
+
+    # The random state is drawn when this function body runs, so each
+    # invocation of the pipeline function may pass a different value.
+    df_inference = data_loader(
+        random_state=random.randint(0, 1000), is_inference=True
+    )
+    # The target column name is hard-coded to "target" here.
+    df_inference = inference_preprocessor(
+        dataset_inf=df_inference,
+        preprocess_pipeline=preprocess_pipeline,
+        target="target",
+    )
+    predictor = deploy_endpoint()
+    predict_on_endpoint(predictor, df_inference)
+    if shutdown_endpoint_after_predicting:
+        # `after` names the prediction step; presumably this orders the
+        # shutdown after prediction even though no output of
+        # `predict_on_endpoint` is consumed here -- confirm against ZenML docs.
+        shutdown_endpoint(predictor, after=["predict_on_endpoint"])
diff --git a/...showcase/pipelines/feature_engineering.py → ...fier-e2e/pipelines/feature_engineering.py b/...showcase/pipelines/feature_engineering.py → ...fier-e2e/pipelines/feature_engineering.py
@@ -1,13 +1,29 @@
-# {% include 'template/license_header' %}
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
-import random
 from typing import List, Optional
+import random
 
 from steps import (
     data_loader,
     data_preprocessor,
     data_splitter,
 )
+
 from zenml import pipeline
 from zenml.logger import get_logger
 
@@ -21,6 +37,7 @@ def feature_engineering(
     normalize: Optional[bool] = None,
     drop_columns: Optional[List[str]] = None,
     target: Optional[str] = "target",
+    random_state: int = None,
 ):
     """
     Feature engineering pipeline.
@@ -34,11 +51,16 @@ def feature_engineering(
         normalize: If `True` dataset will be normalized with MinMaxScaler
         drop_columns: List of columns to drop from dataset
         target: Name of target column in dataset
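+        random_state: Random state forwarded to the data loader and to the
+            later step call that also takes `random_state`. When `None`, a
+            random integer between 0 and 1000 is drawn.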
+
+    Returns:
+        The processed datasets (dataset_trn, dataset_tst).
     """
-    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
     # Link all the steps together by calling them and passing the output
     # of one step as the input of the next step.
-    raw_data = data_loader(random_state=random.randint(0, 100), target=target)
+    if random_state is None:
+        random_state = random.randint(0,1000)
+    raw_data = data_loader(random_state=random_state, target=target)
     dataset_trn, dataset_tst = data_splitter(
         dataset=raw_data,
         test_size=test_size,
@@ -50,5 +72,6 @@ def feature_engineering(
         normalize=normalize,
         drop_columns=drop_columns,
         target=target,
+        random_state=random_state,
     )
     return dataset_trn, dataset_tst
diff --git a/classifier-e2e/pipelines/inference.py b/classifier-e2e/pipelines/inference.py
@@ -0,0 +1,62 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from steps import (
+    data_loader,
+    inference_predict,
+    inference_preprocessor,
+)
+
+from zenml import get_pipeline_context, pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def inference(random_state: str, target: str):
+    """
+    Model inference pipeline.
+
+    This is a pipeline that loads the inference data, processes it with
+    the same preprocessing pipeline used in training, and runs inference
+    with the trained model.
+
+    Both the model and the preprocessing pipeline are read as artifacts
+    from the model attached to the pipeline context.
+
+    Args:
+        random_state: Random state for reproducibility; forwarded to the
+            `data_loader` step.
+        target: Name of target column in dataset; forwarded to the
+            `inference_preprocessor` step.
+    """
+    # NOTE(review): `random_state` is annotated as `str`, while the deploy
+    # pipeline passes `data_loader` an int from `random.randint` -- confirm
+    # whether this annotation should be `int`.
+
+    # Get the production model artifact
+    model = get_pipeline_context().model.get_artifact("breast_cancer_classifier")
+
+    # Get the preprocess pipeline artifact associated with this version
+    preprocess_pipeline = get_pipeline_context().model.get_artifact(
+        "preprocess_pipeline"
+    )
+
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+    df_inference = data_loader(random_state=random_state, is_inference=True)
+    df_inference = inference_preprocessor(
+        dataset_inf=df_inference,
+        preprocess_pipeline=preprocess_pipeline,
+        target=target,
+    )
+    inference_predict(
+        model=model,
+        dataset_inf=df_inference,
+    )