From 03fa8505403889ab42b58450afbd6e5fdde9a547 Mon Sep 17 00:00:00 2001 From: votti Date: Thu, 9 Feb 2023 23:58:27 +0100 Subject: [PATCH 01/26] Adds a first draft of a kfpv1-metricscollector Closesly modelled after the tfevent-metricscollector. Currently not yet working, as there are issues that the arguments from the `injector_webhoook` are somehow not passed. Addresses: https://github.com/kubeflow/katib/issues/2019 --- .../v1beta1/kfpv1-metricscollector/Dockerfile | 24 ++++ .../v1beta1/kfpv1-metricscollector/main.py | 112 ++++++++++++++++++ .../kfpv1-metricscollector/requirements.txt | 5 + pkg/metricscollector/v1beta1/common/const.py | 3 + .../kfpv1-metricscollector/__init__.py | 0 .../kfpv1-metricscollector/metrics_loader.py | 94 +++++++++++++++ 6 files changed, 238 insertions(+) create mode 100644 cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile create mode 100644 cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py create mode 100644 cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt create mode 100644 pkg/metricscollector/v1beta1/kfpv1-metricscollector/__init__.py create mode 100644 pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile new file mode 100644 index 00000000000..4bd83564dc9 --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.9-slim + +ARG TARGETARCH +ENV TARGET_DIR /opt/katib +ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfpv1-metricscollector +ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfpv1-metricscollector/::${TARGET_DIR}/pkg/metricscollector/v1beta1/common/ + +ADD ./pkg/ ${TARGET_DIR}/pkg/ +ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/ + +WORKDIR 
${TARGET_DIR}/${METRICS_COLLECTOR_DIR} + +RUN if [ "${TARGETARCH}" = "arm64" ]; then \ + apt-get -y update && \ + apt-get -y install gfortran libpcre3 libpcre3-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +RUN pip install --no-cache-dir -r requirements.txt +RUN chgrp -R 0 ${TARGET_DIR} \ + && chmod -R g+rwX ${TARGET_DIR} + +ENTRYPOINT ["python", "main.py"] diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py new file mode 100644 index 00000000000..ef8939e321b --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py @@ -0,0 +1,112 @@ +# Copyright 2022 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from logging import INFO, StreamHandler, getLogger + +import api_pb2 +import const +import grpc +from metrics_loader import MetricsCollector +from pns import WaitMainProcesses + +timeout_in_seconds = 60 + +# Next steps: +# +# - check is it is possible to mount the argo share +# - read the metrics from the tgz archive +# - +def parse_options(): + parser = argparse.ArgumentParser( + description="KFP V1 MetricsCollector", add_help=True + ) + + # TODO (andreyvelich): Add early stopping flags. 
+ parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="") + parser.add_argument("-t", "--trial_name", type=str, default="") + parser.add_argument( + "-path", + "--metrics_file_dir", + type=str, + default=const.DEFAULT_METRICS_FILE_KFPV1_DIR, + ) + parser.add_argument("-m", "--metric_names", type=str, default="") + parser.add_argument("-o-type", "--objective_type", type=str, default="") + parser.add_argument("-f", "--metric_filters", type=str, default="") + parser.add_argument( + "-p", "--poll_interval", type=int, default=const.DEFAULT_POLL_INTERVAL + ) + parser.add_argument( + "-timeout", "--timeout", type=int, default=const.DEFAULT_TIMEOUT + ) + parser.add_argument( + "-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES + ) + parser.add_argument( + "-fn", + "--metrics_file_name", + type=str, + default=const.DEFAULT_METRICS_FILE_KFPV1_FILE, + ) + + opt = parser.parse_args() + return opt + + +if __name__ == "__main__": + logger = getLogger(__name__) + handler = StreamHandler() + handler.setLevel(INFO) + logger.setLevel(INFO) + logger.addHandler(handler) + logger.propagate = False + opt = parse_options() + wait_all_processes = opt.wait_all_processes.lower() == "true" + db_manager_server = opt.db_manager_server_addr.split(":") + if len(db_manager_server) != 2: + raise Exception( + "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr + ) + + WaitMainProcesses( + pool_interval=opt.poll_interval, + timout=opt.timeout, + wait_all=wait_all_processes, + completed_marked_dir=None, + ) + + mc = MetricsCollector(opt.metric_names.split(";")) + metrics_file = os.path.join(opt.metrics_file_dir, opt.metrics_file_name) + observation_log = mc.parse_file(metrics_file) + + channel = grpc.beta.implementations.insecure_channel( + db_manager_server[0], int(db_manager_server[1]) + ) + + with api_pb2.beta_create_DBManager_stub(channel) as client: + logger.info( + "In " + + opt.trial_name + + " " + + 
str(len(observation_log.metric_logs)) + + " metrics will be reported." + ) + client.ReportObservationLog( + api_pb2.ReportObservationLogRequest( + trial_name=opt.trial_name, observation_log=observation_log + ), + timeout=timeout_in_seconds, + ) diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt new file mode 100644 index 00000000000..fa4fc7d22b9 --- /dev/null +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt @@ -0,0 +1,5 @@ +psutil==5.8.0 +rfc3339>=6.2 +grpcio==1.41.1 +googleapis-common-protos==1.6.0 +protobuf==3.20.0 diff --git a/pkg/metricscollector/v1beta1/common/const.py b/pkg/metricscollector/v1beta1/common/const.py index f3bdf56af46..1e5f4a103e8 100644 --- a/pkg/metricscollector/v1beta1/common/const.py +++ b/pkg/metricscollector/v1beta1/common/const.py @@ -20,6 +20,9 @@ DEFAULT_WAIT_ALL_PROCESSES = "True" # Default value for directory where TF event metrics are reported DEFAULT_METRICS_FILE_DIR = "/log" +# Default directory and file name where Kubeflow Pipelines (v1) metrics are reported +DEFAULT_METRICS_FILE_KFPV1_DIR = "/tmp/outputs/mlpipeline_metrics" +DEFAULT_METRICS_FILE_KFPV1_FILE = "data" # Job finished marker in $$$$.pid file when main process is completed TRAINING_COMPLETED = "completed" diff --git a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/__init__.py b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py new file mode 100644 index 00000000000..8c159c77999 --- /dev/null +++ b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py @@ -0,0 +1,94 @@ +# Copyright 2022 The Kubeflow Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TFEventFileParser parses tfevent files and returns an ObservationLog of the metrics specified. +# When the event file is under a directory(e.g. test dir), please specify "{{dirname}}/{{metrics name}}" +# For example, in the Tensorflow MNIST Classification With Summaries: +# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py. +# The "accuracy" and "loss" metric is saved under "train" and "test" directories. +# So in the Metrics Collector specification, please specify name of "train" or "test" directory. 
+# Check TFJob example for more information: +# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22 + +from datetime import datetime +from logging import getLogger, StreamHandler, INFO +from typing import List +import json + +import rfc3339 +import api_pb2 +from pkg.metricscollector.v1beta1.common import const + + +def parse_metrics(fn: str) -> List[api_pb2.MetricLog]: + """Parse a kubeflow pipeline metrics file + + Args: + fn (str): path to metrics file + + Returns: + List[api_pb2.MetricLog]: A list of logged metrics + """ + metrics = [] + with open(fn, "r") as f: + metrics_dict = json.load(f) + for m in metrics_dict["metrics"]: + name = m["name"] + value = m["numberValue"] + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339(datetime.now()), + metric=api_pb2.Metric(name=name, value=str(value)), + ) + metrics.append(ml) + return metrics + + +class MetricsCollector: + def __init__(self, metric_names): + self.logger = getLogger(__name__) + handler = StreamHandler() + handler.setLevel(INFO) + self.logger.setLevel(INFO) + self.logger.addHandler(handler) + self.logger.propagate = False + self.metrics = metric_names + + def parse_file(self, filename): + self.logger.info(filename + " will be parsed.") + mls = parse_metrics(filename) + + # Metrics logs must contain at least one objective metric value + # Objective metric is located at first index + is_objective_metric_reported = False + for ml in mls: + if ml.metric.name == self.metrics[0]: + is_objective_metric_reported = True + break + # If objective metrics were not reported, insert unavailable value in the DB + if not is_objective_metric_reported: + mls = [ + api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339(datetime.now()), + metric=api_pb2.Metric( + name=self.metrics[0], value=const.UNAVAILABLE_METRIC_VALUE + ), + ) + ] + self.logger.info( + "Objective metric {} is not found in metrics file, {} value is reported".format( + 
self.metrics[0], const.UNAVAILABLE_METRIC_VALUE + ) + ) + + return api_pb2.ObservationLog(metric_logs=mls) From 891847341bd58831adfa38d98e6652b552756d36 Mon Sep 17 00:00:00 2001 From: votti Date: Fri, 10 Feb 2023 07:26:04 +0100 Subject: [PATCH 02/26] Use PodName as input The TrialName can be parse from the pod name. This seems currently a good way to get the trial name. For more discussion see: https://github.com/kubeflow/katib/issues/2109 --- .../v1beta1/kfpv1-metricscollector/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py index ef8939e321b..294fe68876f 100644 --- a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py @@ -36,7 +36,7 @@ def parse_options(): # TODO (andreyvelich): Add early stopping flags. parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="") - parser.add_argument("-t", "--trial_name", type=str, default="") + parser.add_argument("-t", "--pod_name", type=str, default="") parser.add_argument( "-path", "--metrics_file_dir", @@ -76,6 +76,7 @@ def parse_options(): opt = parse_options() wait_all_processes = opt.wait_all_processes.lower() == "true" db_manager_server = opt.db_manager_server_addr.split(":") + trial_name = '-'.join(opt.pod_name.split('-')[:-1]) if len(db_manager_server) != 2: raise Exception( "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr @@ -99,14 +100,14 @@ def parse_options(): with api_pb2.beta_create_DBManager_stub(channel) as client: logger.info( "In " - + opt.trial_name + + trial_name + " " + str(len(observation_log.metric_logs)) + " metrics will be reported." 
) client.ReportObservationLog( api_pb2.ReportObservationLogRequest( - trial_name=opt.trial_name, observation_log=observation_log + trial_name=trial_name, observation_log=observation_log ), timeout=timeout_in_seconds, ) From fd53d8537c58512b1b4deacd1207bbc3de18405c Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 15 Feb 2023 12:12:19 +0100 Subject: [PATCH 03/26] Adds example for tuning a kfp v1 pipeline with Katib This example illustrates how a full kfp pipeline can be tuned using Katib. It is based on a metrics collector to collect kubeflow pipeline metrics (#2019). This is used as a Custom Collector. Addresses: #1914, #2019 --- examples/v1beta1/kubeflow-pipelines/README.md | 14 +- .../kubeflow-kfpv1-opt-mnist.ipynb | 1084 +++++++++++++++++ 2 files changed, 1095 insertions(+), 3 deletions(-) create mode 100644 examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb diff --git a/examples/v1beta1/kubeflow-pipelines/README.md b/examples/v1beta1/kubeflow-pipelines/README.md index df1e2bf0041..b6e53c21555 100644 --- a/examples/v1beta1/kubeflow-pipelines/README.md +++ b/examples/v1beta1/kubeflow-pipelines/README.md @@ -3,6 +3,10 @@ The following examples show how to use Katib with [Kubeflow Pipelines](https://github.com/kubeflow/pipelines). +Two different aspects are illustrated here: +A) How to orchestrate Katib experiments from Kubeflow pipelines using the Katib Kubeflow Component (Example 1 & 2) +B) How to use Katib to tune parameters of Kubeflow pipelines + You can find the Katib Component source code for the Kubeflow Pipelines [here](https://github.com/kubeflow/pipelines/tree/master/components/kubeflow/katib-launcher). @@ -13,6 +17,8 @@ You have to install the following Python SDK to run these examples: - [`kfp`](https://pypi.org/project/kfp/) >= 1.8.12 - [`kubeflow-katib`](https://pypi.org/project/kubeflow-katib/) >= 0.13.0 +In order to run parameter tuning over Kubeflow pipelines, additionally Katib needs to be setup to run with Argo workflow tasks. 
The setup is described within the example notebook (3). + ## Multi-User Pipelines Setup The Notebooks examples run Pipelines in multi-user mode and your Kubeflow Notebook @@ -25,10 +31,12 @@ to give an access Kubeflow Notebook to run Kubeflow Pipelines. The following Pipelines are deployed from Kubeflow Notebook: -- [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb) +1) [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb) + +2) [Katib Experiment with Early Stopping](early-stopping.ipynb) -- [Katib Experiment with Early Stopping](early-stopping.ipynb) +3) [Tune parameters of a `MNIST` kubeflow pipeline with Katib](kubeflow-kfpv1-opt-mnist.ipynb) -The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI: +The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI for examples 1 & 2: - [MPIJob Horovod](mpi-job-horovod.py) diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb new file mode 100644 index 00000000000..cc16c6d528a --- /dev/null +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -0,0 +1,1084 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Katib parameter tuning over Kubeflow Pipelines (V1)\n", + "\n", + "This example shows how parameter tuning can be done over a multistep Kubeflow pipeline.\n", + "\n", + "The pipeline consists of 4 steps:\n", + "- Download of the training images and labels from the original MNIST publication\n", + "- Preparation of the training dataset\n", + "- Image pre-processing\n", + "- Model fitting\n", + "\n", + "The pipeline has model fitting parameters as well as image pre-processing parameters exposed as pipeline parameters for tuning. 
Katib will be used to explore the question if image preprocessing using a simple histogram normalization might improve a neural network training on MNIST.\n", + "\n", + "## Requirements\n", + "\n", + "This requires a Kubeflow installation with Katib and Pipelines.\n", + "\n", + "Additionally the Katib-Argo integration needs to be setup:\n", + "\n", + "If you are running on a full Kubeflow installation *do not reinstall or update Argo* as this will likely break your installation.\n", + "\n", + "Just run the following commands:\n", + "\n", + "Enable side-car injection:\n", + "\n", + "`kubectl patch namespace argo -p '{\"metadata\":{\"labels\":{\"katib.kubeflow.org/metrics-collector-injection\":\"enabled\"}}}'`\n", + "\n", + "\n", + "Verify that the emissary executor is active (should be default in newer Kubeflow installations):\n", + "\n", + "` kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor`\n", + "\n", + "Patch the Katib controller:\n", + "\n", + "`kubectl patch ClusterRole katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/rules/-\", \"value\": {\"apiGroups\":[\"argoproj.io\"],\"resources\":[\"workflows\"],\"verbs\":[\"get\", \"list\", \"watch\", \"create\", \"delete\"]}}]'\n", + "`\n", + "\n", + "`kubectl patch Deployment katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args/-\", \"value\": \"--trial-resources=Workflow.v1alpha1.argoproj.io\"}]'`\n", + "\n", + "For more details and how to set this up on a partial Kubeflow installation follow:\n", + "https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.md\n", + "If you are running on a full Kubeflow installation *DO NOT INSTALL ARGO* as this will likely break your installation.\n", + "\n", + "Just run the following commands:\n", + "\n", + "Enable side-car injection:\n", + "\n", + "`kubectl patch namespace argo -p 
'{\"metadata\":{\"labels\":{\"katib.kubeflow.org/metrics-collector-injection\":\"enabled\"}}}'`\n", + "\n", + "\n", + "Verify that the emissary executor is active (should be default in newer Kubeflow installations):\n", + "\n", + "` kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor`\n", + "\n", + "Patch the Katib controller:\n", + "\n", + "`kubectl patch ClusterRole katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/rules/-\", \"value\": {\"apiGroups\":[\"argoproj.io\"],\"resources\":[\"workflows\"],\"verbs\":[\"get\", \"list\", \"watch\", \"create\", \"delete\"]}}]'\n", + "`\n", + "\n", + "`kubectl patch Deployment katib-controller -n kubeflow --type=json \\\n", + " -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args/-\", \"value\": \"--trial-resources=Workflow.v1alpha1.argoproj.io\"}]'`\n", + "\n", + "For more details and how to set this up on a partial Kubeflow installation follow:\n", + "https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.md\n", + "\n" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building the base Kubeflow pipeline\n", + "\n", + "The next steps will build up the following Kubeflow pipeline:\n", + "\n", + "![image.png](attachment:image.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set default variables\n", + "\n", + "The following default variables should be changed when running the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Namespace to run the workloads under\n", + "USER_NAMESPACE = \"vito-zanotelli\"\n", + "# Pipeline service account\n", + "# On a Kubeflow instance on GCP this should be 'default-editor'\n", + "KFP_SERVICE_ACCOUNT = \"default-editor\"\n", + "\n", + "\n", + "# 
Cosmetic variables\n", + "# Pipeline run variables\n", + "KFP_EXPERIMENT = \"katib-kfp-example\"\n", + "KFP_RUN = \"mnist-pipeline-v1\"\n", + "\n", + "# Katib run variables\n", + "KATIB_EXPERIMENT = \"katib-kfp-example-v1\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install and load required python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install required packages (Kubeflow Pipelines and Katib SDK).\n", + "!pip install kfp==1.8.12\n", + "!pip install kubeflow-katib==0.13.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import kfp\n", + "import kfp.components as components\n", + "import kfp.dsl as dsl\n", + "from kfp.components import InputPath, OutputPath, create_component_from_func" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize the Kubeflow pipeline client\n", + "\n", + "Documentation how this is done in various environments: https://www.kubeflow.org/docs/components/pipelines/v1/sdk/connect-api/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kfp_client = kfp.Client()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the downloader component\n", + "\n", + "This is a publicly available, generic downloader we use to download the raw MNIST data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "download_data_op = components.load_component_from_url(\n", + " \"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse the MNIST raw data format\n", 
+ "\n", + "This is a component from text that converts the raw MNIST data format into a tensorflow compatible format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "parse_mnist_op = components.load_component_from_text(\n", + " \"\"\"\n", + "name: Parse MNIST\n", + "inputs:\n", + "- {name: Images, description: gziped images in the idx format}\n", + "- {name: Labels, description: gziped labels in the idx format}\n", + "outputs:\n", + "- {name: Dataset}\n", + "metadata:\n", + " annotations:\n", + " author: Vito Zanotelli, D-ONE.ai\n", + " description: Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml\n", + "implementation:\n", + " container:\n", + " image: tensorflow/tensorflow:2.7.1\n", + " command:\n", + " - sh\n", + " - -ec\n", + " - |\n", + " # This is how additional packages can be installed dynamically\n", + " python3 -m pip install pip idx2numpy\n", + " # Run the rest of the command after installing the packages.\n", + " \"$0\" \"$@\"\n", + " - python3\n", + " - -u # Auto-flush. 
We want the logs to appear in the console immediately.\n", + " - -c # Inline scripts are easy, but have size limitaions and the error traces do not show source lines.\n", + " - |\n", + " import gzip\n", + " import idx2numpy\n", + " import sys\n", + " from pathlib import Path\n", + " import pickle\n", + " import tensorflow as tf\n", + " img_path = sys.argv[1]\n", + " label_path = sys.argv[2]\n", + " output_path = sys.argv[3]\n", + " with gzip.open(img_path, 'rb') as f:\n", + " x = idx2numpy.convert_from_string(f.read())\n", + " with gzip.open(label_path, 'rb') as f:\n", + " y = idx2numpy.convert_from_string(f.read())\n", + " #one-hot encode the categories\n", + " x_out = tf.convert_to_tensor(x)\n", + " y_out = tf.keras.utils.to_categorical(y)\n", + " Path(output_path).parent.mkdir(parents=True, exist_ok=True)\n", + " with open(output_path, 'wb') as output_file:\n", + " pickle.dump((x_out, y_out), output_file)\n", + " - {inputPath: Images}\n", + " - {inputPath: Labels}\n", + " - {outputPath: Dataset}\n", + "\"\"\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process the images\n", + "\n", + "This does the pre-processing of the images, including a training-validation split.\n", + "\n", + "Here also an optional `histogram_norm` image normalization step can be activated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process(\n", + " data_raw_path: InputPath(str), # type: ignore\n", + " data_processed_path: OutputPath(str), # type: ignore\n", + " val_pct: float = 0.2,\n", + " trainset_flag: bool = True,\n", + " histogram_norm: bool = False,\n", + "):\n", + " \"\"\"\n", + " Here we do all the preprocessing\n", + " if the data path is for training data we:\n", + " (1) Normalize the data\n", + " (2) split the train and val data\n", + " If it is for unseen test data, we:\n", + " (1) Normalize the data\n", + " This function returns in any 
case the processed data path\n", + " \"\"\"\n", + " # sklearn\n", + " import pickle\n", + " from sklearn.model_selection import train_test_split\n", + " import tensorflow as tf\n", + " import tensorflow_addons as tfa\n", + "\n", + " def img_norm(x):\n", + " x_ = tf.reshape(x / 255, list(x.shape) + [1])\n", + "\n", + " if histogram_norm:\n", + " x_ = tfa.image.equalize(x_)\n", + " return x_\n", + "\n", + " with open(data_raw_path, \"rb\") as f:\n", + " x, y = pickle.load(f)\n", + " if trainset_flag:\n", + "\n", + " x_ = img_norm(x)\n", + " x_train, x_val, y_train, y_val = train_test_split(\n", + " x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\n", + " )\n", + "\n", + " with open(data_processed_path, \"wb\") as output_file:\n", + " pickle.dump((x_train, y_train, x_val, y_val), output_file)\n", + "\n", + " else:\n", + " x_ = img_norm(x)\n", + " with open(data_processed_path, \"wb\") as output_file:\n", + " pickle.dump((x_, y), output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "process_op = create_component_from_func(\n", + " func=process,\n", + " base_image=\"tensorflow/tensorflow:2.7.1\", # Optional\n", + " packages_to_install=[\"scikit-learn\", \"tensorflow-addons[tensorflow]\"], # Optional\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training component\n", + "\n", + "Component with ML hyperparameters as parameters.\n", + "Note that the `metrics` that should be tracked by Katib need to be\n", + "saved as ML metrics output artifacts.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(\n", + " data_train_path: InputPath(str), # type: ignore\n", + " model_out_path: OutputPath(str), # type: ignore\n", + " mlpipeline_metrics_path: OutputPath(\"Metrics\"), # type: ignore # noqa: F821\n", + " lr: float = 1e-4,\n", + " optimizer: str = 
\"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 1,\n", + " batch_size: int = 32,\n", + "):\n", + " \"\"\"\n", + " This is the simulated train part of our ML pipeline where training is performed\n", + " \"\"\"\n", + "\n", + " import tensorflow as tf\n", + " import pickle\n", + " from tensorflow.keras.preprocessing.image import ImageDataGenerator\n", + " import json\n", + "\n", + " with open(data_train_path, \"rb\") as f:\n", + " x_train, y_train, x_val, y_val = pickle.load(f)\n", + "\n", + " model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.Conv2D(\n", + " 64, (3, 3), activation=\"relu\", input_shape=(28, 28, 1)\n", + " ),\n", + " tf.keras.layers.MaxPooling2D(2, 2),\n", + " tf.keras.layers.Conv2D(64, (3, 3), activation=\"relu\"),\n", + " tf.keras.layers.MaxPooling2D(2, 2),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=\"relu\"),\n", + " tf.keras.layers.Dense(10, activation=\"softmax\"),\n", + " ]\n", + " )\n", + "\n", + " if optimizer.lower() == \"sgd\":\n", + " optimizer = tf.keras.optimizers.SGD(lr)\n", + " else:\n", + " optimizer = tf.keras.optimizers.Adam(lr)\n", + "\n", + " model.compile(loss=loss, optimizer=optimizer, metrics=[\"accuracy\"])\n", + "\n", + " # fit the model\n", + " model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\n", + " monitor=\"val_accuracy\", patience=10, verbose=1, restore_best_weights=True\n", + " )\n", + "\n", + " train_datagen = ImageDataGenerator()\n", + "\n", + " validation_datagen = ImageDataGenerator()\n", + " history = model.fit(\n", + " train_datagen.flow(x_train, y_train, batch_size=batch_size),\n", + " epochs=epochs,\n", + " validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\n", + " shuffle=False,\n", + " callbacks=[model_early_stopping_callback],\n", + " )\n", + "\n", + " model.save(model_out_path, save_format=\"tf\")\n", + "\n", + " metrics = {\n", + " \"metrics\": [\n", + " {\n", + " \"name\": 
\"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": history.history[\"accuracy\"][\n", + " -1\n", + " ], # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " {\n", + " \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": history.history[\"val_accuracy\"][\n", + " -1\n", + " ], # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " ]\n", + " }\n", + " with open(mlpipeline_metrics_path, \"w\") as f:\n", + " json.dump(metrics, f)\n", + "\n", + "\n", + "train_op = create_component_from_func(\n", + " func=train, base_image=\"tensorflow/tensorflow:2.7.1\", packages_to_install=[\"scipy\"]\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build the full pipeline\n", + "\n", + "These wires the components to a full pipeline.\n", + "\n", + "The only thing required to make the pipeline Katib compatible is:\n", + "\n", + "1) A pod label to mark the pod from which the metrics tracked by Katib should be collected from: \"katib.kubeflow.org/model-training\", \"true\"\n", + "2) A mark to prevent caching on this pod: `execution_options.caching_strategy.max_cache_staleness = \"P0D\"`\n", + "\n", + "In addition, currently the pod label for caching seems not be added by default and thus the cache is not used. To enable cache usage, the cache label is added to all the steps.\n", + "\n", + "Apart from these two requirements, there is no restriction on how the pipeline is build. 
The pipeline remains a normal Kubeflow pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def _label_cache(step):\n", + " \"\"\"Helper to add pod cache label\n", + "\n", + " Currently there seems to be an issue with pod labeling.\n", + " \"\"\"\n", + " step.add_pod_label(\"pipelines.kubeflow.org/cache_enabled\", \"true\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Download MNIST dataset\",\n", + " description=\"A pipeline to download the MNIST dataset files\",\n", + ")\n", + "def mnist_training_pipeline(\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 3,\n", + " batch_size: int = 5,\n", + " histogram_norm: bool = False,\n", + "):\n", + " TRAIN_IMG_URL = \"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\"\n", + " TRAIN_LAB_URL = \"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\"\n", + "\n", + " train_imgs = download_data_op(TRAIN_IMG_URL)\n", + " train_imgs.set_display_name(\"Download training images\")\n", + " _label_cache(train_imgs)\n", + "\n", + " train_y = download_data_op(TRAIN_LAB_URL)\n", + " train_y.set_display_name(\"Download training labels\")\n", + " _label_cache(train_y)\n", + "\n", + " mnist_train = parse_mnist_op(train_imgs.output, train_y.output)\n", + " mnist_train.set_display_name(\"Prepare train dataset\")\n", + " _label_cache(mnist_train)\n", + "\n", + " processed_train = (\n", + " process_op(\n", + " mnist_train.output,\n", + " val_pct=0.2,\n", + " trainset_flag=True,\n", + " histogram_norm=histogram_norm,\n", + " )\n", + " .set_cpu_limit(\"1\")\n", + " .set_memory_limit(\"2Gi\")\n", + " .set_display_name(\"Preprocess images\")\n", + " )\n", + " _label_cache(processed_train)\n", + "\n", + " training_output = (\n", + " train_op(\n", + " 
processed_train.outputs[\"data_processed\"],\n", + " lr=lr,\n", + " optimizer=optimizer,\n", + " epochs=epochs,\n", + " batch_size=batch_size,\n", + " loss=loss,\n", + " )\n", + " .set_cpu_limit(\"1\")\n", + " .set_memory_limit(\"2Gi\")\n", + " )\n", + " training_output.set_display_name(\"Fit the model\")\n", + " # This pod label indicates which pod Katib should collect the metric from.\n", + " # A metrics collecting sidecar container will be added\n", + " training_output.add_pod_label(\"katib.kubeflow.org/model-training\", \"true\")\n", + " # This step needs to run always, as otherwise the metrics for Katib could not\n", + " # be collected.\n", + " training_output.execution_options.caching_strategy.max_cache_staleness = \"P0D\"\n", + "\n", + " return mnist_train.output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = kfp_client.create_run_from_pipeline_func(\n", + " mnist_training_pipeline,\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " # You can optionally override your pipeline_root when submitting the run too:\n", + " # pipeline_root='gs://my-pipeline-root/example-pipeline',\n", + " arguments={\"histogram_norm\": \"0\"},\n", + " experiment_name=KFP_EXPERIMENT,\n", + " run_name=KFP_RUN,\n", + " namespace=USER_NAMESPACE,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parameter tuning with Katib\n", + "\n", + "We now want to do parameter tuning over the whole pipeline with Katib.\n", + "\n", + "This requires us to build up a specification for the Katib experiment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First import the Katib python components:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import yaml\n", + "from typing import List\n", + "\n", + "from kubernetes.client.models import 
V1ObjectMeta\n", + "from kubeflow.katib import ApiClient\n", + "from kubeflow.katib import KatibClient\n", + "from kubeflow.katib import V1beta1Experiment\n", + "from kubeflow.katib import V1beta1ExperimentSpec\n", + "from kubeflow.katib import V1beta1AlgorithmSpec\n", + "from kubeflow.katib import V1beta1ObjectiveSpec\n", + "from kubeflow.katib import V1beta1ParameterSpec\n", + "from kubeflow.katib import V1beta1FeasibleSpace\n", + "from kubeflow.katib import V1beta1TrialTemplate\n", + "from kubeflow.katib import V1beta1TrialParameterSpec\n", + "from kubeflow.katib import V1beta1MetricsCollectorSpec\n", + "from kubeflow.katib import V1beta1CollectorSpec" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to build a katib experiment, we require a trial spec.\n", + "\n", + "In this case the trial spec is an Argo workflow produced from the Kubeflow pipeline.\n", + "\n", + "This workflow can be run thanks to the Katib-Argo integration that was set up in the requirements section.\n", + "\n", + "\n", + "The Katib Experiment consists of many components, that we next will set up using custom built helper functions:" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Helper functions to build the individual Katib Experiment Components\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_trial_spec(\n", + " pipeline, params_list: List[dsl.PipelineParam], service_account: str | None = None\n", + "):\n", + " \"\"\"\n", + " Create an Argo workflow specification from a KFP pipeline function\n", + "\n", + " The Argo workflow CRD will be the basis for the trial_template used\n", + " by Katib.\n", + "\n", + " Args:\n", + " pipeline: a kubeflow pipeline function\n", + " params_list (List[dsl.PipelineParam]): a list of mappings of Kubeflow pipeline parameters\n", + " to Katib trialParameters.\n", + " These 
need to map the pipeline parameter to the Katib parameter.\n", + " Eg: [dsl.PipelineParam(name='lr', value='${trialParameters.learningRate}')]\n", + " here `lr` is the PipelineParam and `trialParameters.learningRate` the Katib trialParameter.\n", + "\n", + " \"\"\"\n", + " compiler = kfp.compiler.Compiler(\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " )\n", + " # Here the pipeline parameters are passed.\n", + " # These will be generated in the Katib trials\n", + " trial_spec = compiler._create_workflow(pipeline, params_list=params_list)\n", + " # Somehow the pipeline is configured with the wrong serviceAccountName by default\n", + " if service_account is not None:\n", + " trial_spec[\"spec\"][\"serviceAccountName\"] = service_account\n", + "\n", + " return trial_spec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_trial_template(\n", + " trial_spec,\n", + " trial_param_specs: List[V1beta1TrialParameterSpec],\n", + " retain_pods: bool = False,\n", + ") -> V1beta1TrialTemplate:\n", + " \"\"\"Generate a trial template from the spec\n", + "\n", + " This takes the Argo workflow CRD and wrapps it as a\n", + " Katib trial template.\n", + " Here the Katib trial parameters are defined.\n", + "\n", + " Args:\n", + " trial_spec (Argo workflow spec): The workflow/pipeline to tune\n", + " trial_params_spec (List[V1beta1TrialParameterSpec]): The trial parameter specifications\n", + " Note that the `name` of the parameters needs to match the names refered to by the\n", + " create_trial_spec `params_list` arguments.\n", + " The `ref` needs to match the names used in the parameter space defined in `V1beta1ParameterSpec`.\n", + "\n", + " Returns:\n", + " V1beta1TrialTemplate: the trial template\n", + " \"\"\"\n", + "\n", + " trial_template = V1beta1TrialTemplate(\n", + " primary_container_name=\"main\", # Name of the primary container returning the metrics in the workflow\n", + " # The 
label used for the pipeline component returning the pipeline specs\n", + " primary_pod_labels={\"katib.kubeflow.org/model-training\": \"true\"},\n", + " trial_parameters=trial_param_specs,\n", + " trial_spec=trial_spec,\n", + " success_condition='status.[@this].#(phase==\"Succeeded\")#',\n", + " failure_condition='status.[@this].#(phase==\"Failed\")#',\n", + " retain=retain_pods, # Retain completed pods - left hear for easier debugging\n", + " )\n", + " return trial_template" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_metrics_collector_spec(objective: V1beta1ObjectiveSpec):\n", + " \"\"\"This defines the custom metrics collector\n", + "\n", + " This custom metrics connector was built to collect\n", + " Kubeflow pipeline MLmetrics from a step.\n", + "\n", + " Args:\n", + " objective (V1beta1ObjectiveSpec): the objective spec used to get the metrics names\n", + "\n", + " \"\"\"\n", + "\n", + " metric_names = [objective.objective_metric_name] + list(\n", + " objective.additional_metric_names\n", + " )\n", + " collector = V1beta1MetricsCollectorSpec(\n", + " source={\n", + " \"fileSystemPath\": {\n", + " # In KFP v1 this seems to be the hardcoded location\n", + " # for this output file..\n", + " \"path\": \"/tmp/outputs/mlpipeline_metrics/data\",\n", + " \"kind\": \"File\",\n", + " }\n", + " },\n", + " collector=V1beta1CollectorSpec(\n", + " kind=\"Custom\",\n", + " custom_collector={\n", + " \"args\": [\n", + " \"-m\",\n", + " f\"{';'.join(metric_names)}\",\n", + " \"-s\",\n", + " \"katib-db-manager.kubeflow:6789\",\n", + " \"-t\",\n", + " \"$(PodName)\",\n", + " \"-path\",\n", + " \"/tmp/outputs/mlpipeline_metrics\",\n", + " ],\n", + " \"image\": \"votti/kfpv1-metricscollector:v0.0.10\",\n", + " \"imagePullPolicy\": \"Always\",\n", + " \"name\": \"custom-metrics-logger-and-collector\",\n", + " \"env\": [\n", + " {\n", + " # In this setup the PodName can be used to\n", + " # infer the 
`trial name` required to report back\n", + " # the metrics.\n", + " \"name\": \"PodName\",\n", + " \"valueFrom\": {\"fieldRef\": {\"fieldPath\": \"metadata.name\"}},\n", + " }\n", + " ],\n", + " },\n", + " ),\n", + " )\n", + " return collector" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Final helper function to create experiments from pipelines\n", + "\n", + "\n", + "This helper function is the main entry point to train pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_katib_experiment_spec(\n", + " pipeline: dsl.Pipeline,\n", + " pipeline_params: List[dsl.PipelineParam],\n", + " trial_params: List[V1beta1TrialParameterSpec],\n", + " trial_params_space: List[V1beta1ParameterSpec],\n", + " objective: V1beta1ObjectiveSpec,\n", + " algorithm: V1beta1AlgorithmSpec,\n", + " max_trial_count: int = 2,\n", + " max_failed_trial_count: int = 2,\n", + " parallel_trial_count: int = 2,\n", + " pipeline_service_account: str | None = None,\n", + " retain_pods: bool = False,\n", + ") -> V1beta1ExperimentSpec:\n", + " \"\"\"Construct a Katib experiment over a KFP pipeline\n", + "\n", + " Args:\n", + " pipeline (dsl.Pipeline): The Kubeflow Pipeline\n", + " pipeline_params (List[dsl.PipelineParam]): A mapping of trial-parameters to pipeline parameters.\n", + " Example: [\n", + " dsl.PipelineParam(name=\"lr\", value=\"${trialParameters.learningRate}\"),\n", + " ...\n", + " ]\n", + " trial_params (List[V1beta1TrialParameterSpec]): Spec for Trial parameters. 
Note that name\n", + " and refs need to match the ones used in `pipeline_params` and `trial_params_space`\n", + " Example: [\n", + " V1beta1TrialParameterSpec(\n", + " name=\"learningRate\",\n", + " description=\"Learning rate for the training model\",\n", + " reference=\"learning_rate\",\n", + " ), ...]\n", + " trial_params_space (List[V1beta1ParameterSpec]): The spec for the parameter space explored in the\n", + " Trials\n", + " Example: [\n", + " V1beta1ParameterSpec(\n", + " name=\"learning_rate\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"0.00001\", max=\"0.001\"),\n", + " ), ...]\n", + " objective (V1beta1ObjectiveSpec): objective spec. The names used here\n", + " need to match the metrics reported by the pipeline.\n", + " Example: V1beta1ObjectiveSpec(\n", + " type=\"maximize\",\n", + " goal=0.9,\n", + " objective_metric_name=\"val-accuracy\",\n", + " additional_metric_names=[\"accuracy\"],\n", + " )\n", + " algorithm (V1beta1AlgorithmSpec): algorithm spec\n", + " Example: V1beta1AlgorithmSpec(\n", + " algorithm_name=\"random\",\n", + " )\n", + " max_trial_count (int, optional): Max total number of trials. Defaults to 2.\n", + " max_failed_trial_count (int, optional): Number of failed trials tolerated. Defaults to 2.\n", + " parallel_trial_count (int, optional): Number of trials run in parallel. Defaults to 2.\n", + " pipeline_service_account (str | None, optional): Name of the service account to run\n", + " pipelines with. Defaults to None (uses pre-configured default).\n", + " On a Kubeflow GCP deployment this should be set to `default-editor`\n", + " retain_pods (bool): retain pods (good for debugging). 
Default: false\n", + "\n", + " Returns:\n", + " V1beta1ExperimentSpec: Katib experiment spec\n", + " \"\"\"\n", + "\n", + " trial_spec = create_trial_spec(\n", + " pipeline, pipeline_params, service_account=pipeline_service_account\n", + " )\n", + "\n", + " # Configure parameters for the Trial template.\n", + " trial_template = create_trial_template(\n", + " trial_spec, trial_params, retain_pods=retain_pods\n", + " )\n", + "\n", + " # Metrics collector spec\n", + " metrics_collector = create_metrics_collector_spec(objective=objective)\n", + "\n", + " # Create an Experiment from the above parameters.\n", + " experiment_spec = V1beta1ExperimentSpec(\n", + " # Experimental Budget\n", + " max_trial_count=max_trial_count,\n", + " max_failed_trial_count=max_failed_trial_count,\n", + " parallel_trial_count=parallel_trial_count,\n", + " # Optimization Objective\n", + " objective=objective,\n", + " # Optimization Algorithm\n", + " algorithm=algorithm,\n", + " # Optimization Parameters\n", + " parameters=trial_params_space,\n", + " # Trial Template\n", + " trial_template=trial_template,\n", + " # Metrics collector\n", + " metrics_collector_spec=metrics_collector,\n", + " )\n", + "\n", + " return experiment_spec" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tune the MNIST pipeline using Katib\n", + "\n", + "First prepare all required input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_params = [\n", + " dsl.PipelineParam(name=\"lr\", value=\"${trialParameters.learningRate}\"),\n", + " dsl.PipelineParam(name=\"batch_size\", value=\"${trialParameters.batchSize}\"),\n", + " dsl.PipelineParam(name=\"histogram_norm\", value=\"${trialParameters.histogramNorm}\"),\n", + "]\n", + "trial_params_specs = [\n", + " V1beta1TrialParameterSpec(\n", + " name=\"learningRate\", # the parameter name that is replaced in your template (see Trial Specification).\n", 
+ " description=\"Learning rate for the training model\",\n", + " reference=\"learning_rate\", # the parameter name that experiment’s suggestion returns (parameter name in the Parameters Specification).\n", + " ),\n", + " V1beta1TrialParameterSpec(\n", + " name=\"batchSize\",\n", + " description=\"Batch size for NN training\",\n", + " reference=\"batch_size\",\n", + " ),\n", + " V1beta1TrialParameterSpec(\n", + " name=\"histogramNorm\",\n", + " description=\"Histogram normalization of image on?\",\n", + " reference=\"histogram_norm\",\n", + " ),\n", + "]\n", + "parameter_space = [\n", + " V1beta1ParameterSpec(\n", + " name=\"learning_rate\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"0.00001\", max=\"0.001\"),\n", + " ),\n", + " V1beta1ParameterSpec(\n", + " name=\"batch_size\",\n", + " parameter_type=\"int\",\n", + " feasible_space=V1beta1FeasibleSpace(min=\"16\", max=\"64\"),\n", + " ),\n", + " V1beta1ParameterSpec(\n", + " name=\"histogram_norm\",\n", + " parameter_type=\"discrete\",\n", + " feasible_space=V1beta1FeasibleSpace(list=[\"0\", \"1\"]),\n", + " ),\n", + "]\n", + "objective = V1beta1ObjectiveSpec(\n", + " type=\"maximize\",\n", + " goal=0.9,\n", + " objective_metric_name=\"val-accuracy\",\n", + " additional_metric_names=[\"accuracy\"],\n", + ")\n", + "\n", + "algorithm = V1beta1AlgorithmSpec(\n", + " algorithm_name=\"random\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the full spec\n", + "\n", + "katib_spec = create_katib_experiment_spec(\n", + " pipeline=mnist_training_pipeline,\n", + " pipeline_params=pipeline_params,\n", + " trial_params=trial_params_specs,\n", + " trial_params_space=parameter_space,\n", + " objective=objective,\n", + " algorithm=algorithm,\n", + " pipeline_service_account=KFP_SERVICE_ACCOUNT,\n", + " max_trial_count=5,\n", + " parallel_trial_count=5,\n", + " retain_pods=False,\n", + ")" + ] + }, + 
{ + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to generate a full experiment the api_version, kind and namespace need to be defined:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "katib_experiment = V1beta1Experiment(\n", + " api_version=\"kubeflow.org/v1beta1\",\n", + " kind=\"Experiment\",\n", + " metadata=V1ObjectMeta(\n", + " name=KATIB_EXPERIMENT,\n", + " namespace=USER_NAMESPACE,\n", + " ),\n", + " spec=katib_spec,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated yaml can be written out to submit via the web ui:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"experiment_template_kfp_mnist_v1.yaml\", \"w\") as f:\n", + " yaml.dump(ApiClient().sanitize_for_serialization(katib_experiment), f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or submitted via the KatibClient:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "katib_client = KatibClient()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "katib_client.create_experiment(katib_experiment)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should now be able to observe in the Web UI how the Katib\n", + "Experiment is running.\n", + "\n", + "To see how the `Argo Workflows` are started, you can also check the Kubernetes cluster:\n", + "\n", + "`kubectl get Workflow -n `" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "katib-exp", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "346a4e9d8b8e6802b68a0916b92683cfb1882082eeafaaae0a3525ab995e1047" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e9a005191fe9596158d35ba71bf759e5d537ffb9 Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 15 Feb 2023 13:09:15 +0100 Subject: [PATCH 04/26] Adds python < 3.11 compatiblity Before the notebook only worked with Python 3.11. Now it is also tested with 3.10 Also the experiment/run name is extended with a timestamp for easier reruns. --- .../kubeflow-kfpv1-opt-mnist.ipynb | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb index cc16c6d528a..808c9920329 100644 --- a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -147,6 +147,8 @@ "metadata": {}, "outputs": [], "source": [ + "from typing import Optional\n", + "from datetime import datetime as dt\n", "import kfp\n", "import kfp.components as components\n", "import kfp.dsl as dsl\n", @@ -169,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "kpf_client = kfp.Client()" + "kfp_client = kfp.Client()" ] }, { @@ -554,6 +556,7 @@ "metadata": {}, "outputs": [], "source": [ + "kfp_run = f\"{KFP_RUN}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", "run = kfp_client.create_run_from_pipeline_func(\n", " mnist_training_pipeline,\n", " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", @@ -561,7 +564,7 @@ " # pipeline_root='gs://my-pipeline-root/example-pipeline',\n", " arguments={\"histogram_norm\": \"0\"},\n", " experiment_name=KFP_EXPERIMENT,\n", - " run_name=KFP_RUN,\n", + " run_name=kfp_run,\n", " namespace=USER_NAMESPACE,\n", ")" ] @@ -640,7 +643,9 @@ "outputs": [], 
"source": [ "def create_trial_spec(\n", - " pipeline, params_list: List[dsl.PipelineParam], service_account: str | None = None\n", + " pipeline,\n", + " params_list: List[dsl.PipelineParam],\n", + " service_account: Optional[str] = None,\n", "):\n", " \"\"\"\n", " Create an Argo workflow specification from a KFP pipeline function\n", @@ -798,7 +803,7 @@ " max_trial_count: int = 2,\n", " max_failed_trial_count: int = 2,\n", " parallel_trial_count: int = 2,\n", - " pipeline_service_account: str | None = None,\n", + " pipeline_service_account: Optional[str] = None,\n", " retain_pods: bool = False,\n", ") -> V1beta1ExperimentSpec:\n", " \"\"\"Construct a Katib experiment over a KFP pipeline\n", @@ -986,11 +991,14 @@ "metadata": {}, "outputs": [], "source": [ + "katib_experiment_name = (\n", + " f\"{KATIB_EXPERIMENT}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + ")\n", "katib_experiment = V1beta1Experiment(\n", " api_version=\"kubeflow.org/v1beta1\",\n", " kind=\"Experiment\",\n", " metadata=V1ObjectMeta(\n", - " name=KATIB_EXPERIMENT,\n", + " name=katib_experiment_name,\n", " namespace=USER_NAMESPACE,\n", " ),\n", " spec=katib_spec,\n", @@ -1011,7 +1019,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"experiment_template_kfp_mnist_v1.yaml\", \"w\") as f:\n", + "with open(f\"{KATIB_EXPERIMENT}.yaml\", \"w\") as f:\n", " yaml.dump(ApiClient().sanitize_for_serialization(katib_experiment), f)" ] }, @@ -1071,7 +1079,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.16" }, "vscode": { "interpreter": { From 17123d6617e2d54eb3e98a6c83e6a87dc477daab Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 15 Feb 2023 13:47:17 +0100 Subject: [PATCH 05/26] Add histogram equalization before rescaling Otherwise the image was binarized, leading to an artifically bad performance. 
--- .../kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb index 808c9920329..459d2d3f53b 100644 --- a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -305,10 +305,13 @@ " import tensorflow_addons as tfa\n", "\n", " def img_norm(x):\n", - " x_ = tf.reshape(x / 255, list(x.shape) + [1])\n", + " x_ = tf.reshape(x, list(x.shape) + [1])\n", "\n", " if histogram_norm:\n", " x_ = tfa.image.equalize(x_)\n", + "\n", + " # Scale between 0-1\n", + " x_ = x_ / 255\n", " return x_\n", "\n", " with open(data_raw_path, \"rb\") as f:\n", From 4f19db8d453bc72e130793908ef364eabd1f857c Mon Sep 17 00:00:00 2001 From: votti Date: Thu, 16 Mar 2023 08:53:24 +0100 Subject: [PATCH 06/26] Update copyright date And remove an old comment --- .../v1beta1/kfpv1-metricscollector/main.py | 10 +++------- .../v1beta1/kfpv1-metricscollector/metrics_loader.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py index 294fe68876f..900682b9eab 100644 --- a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py @@ -1,4 +1,4 @@ -# Copyright 2022 The Kubeflow Authors. +# Copyright 2023 The Kubeflow Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -24,11 +24,7 @@ timeout_in_seconds = 60 -# Next steps: -# -# - check is it is possible to mount the argo share -# - read the metrics from the tgz archive -# - + def parse_options(): parser = argparse.ArgumentParser( description="KFP V1 MetricsCollector", add_help=True @@ -76,7 +72,7 @@ def parse_options(): opt = parse_options() wait_all_processes = opt.wait_all_processes.lower() == "true" db_manager_server = opt.db_manager_server_addr.split(":") - trial_name = '-'.join(opt.pod_name.split('-')[:-1]) + trial_name = "-".join(opt.pod_name.split("-")[:-1]) if len(db_manager_server) != 2: raise Exception( "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr diff --git a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py index 8c159c77999..ce6bbb41ed3 100644 --- a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py +++ b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py @@ -1,4 +1,4 @@ -# Copyright 2022 The Kubeflow Authors. +# Copyright 2023 The Kubeflow Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,7 +41,7 @@ def parse_metrics(fn: str) -> List[api_pb2.MetricLog]: List[api_pb2.MetricLog]: A list of logged metrics """ metrics = [] - with open(fn, "r") as f: + with open(fn) as f: metrics_dict = json.load(f) for m in metrics_dict["metrics"]: name = m["name"] From 9f83b0fc24c7cb2a635d08d5e5d824992b4fb8d2 Mon Sep 17 00:00:00 2001 From: votti Date: Thu, 16 Mar 2023 08:53:50 +0100 Subject: [PATCH 07/26] Update python version --- cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile index 4bd83564dc9..21771e8dccf 100644 --- a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile +++ b/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim +FROM python:3.10-slim ARG TARGETARCH ENV TARGET_DIR /opt/katib From 61e77eac705862968ea04e274cad370e606d103c Mon Sep 17 00:00:00 2001 From: votti Date: Thu, 16 Mar 2023 09:05:06 +0100 Subject: [PATCH 08/26] Publish the docker image in kubeflowkatib --- .github/workflows/publish-core-images.yaml | 2 ++ .../v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb | 2 +- scripts/v1beta1/build.sh | 3 +++ scripts/v1beta1/push.sh | 3 +++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 750ab03c99e..ebc6a37ad2f 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -32,3 +32,5 @@ jobs: dockerfile: cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile - component-name: tfevent-metrics-collector dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile + - component-name: kfpv1-metrics-collector + dockerfile: cmd/metricscollector/v1beta1/kvpv1-metricscollector/Dockerfile diff --git 
a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb index 459d2d3f53b..801108b68b1 100644 --- a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -761,7 +761,7 @@ " \"-path\",\n", " \"/tmp/outputs/mlpipeline_metrics\",\n", " ],\n", - " \"image\": \"votti/kfpv1-metricscollector:v0.0.10\",\n", + " \"image\": \"docker.io/kubeflowkatib/kfpv1-metrics-collector:latest\",\n", " \"imagePullPolicy\": \"Always\",\n", " \"name\": \"custom-metrics-logger-and-collector\",\n", " \"env\": [\n", diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 3953f49f54d..43952074ae6 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -71,6 +71,9 @@ docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/cert-generator:${ echo -e "\nBuilding file metrics collector image...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . +echo -e "\nBuilding kfpv1 metrics collector image...\n" +docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfpv1-metricscollector/Dockerfile . + echo -e "\nBuilding TF Event metrics collector image...\n" if [ "${ARCH}" == "ppc64le" ]; then docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . 
diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 6f0627b4081..d8c7116552f 100755 --- a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -50,6 +50,9 @@ docker push "${REGISTRY}/cert-generator:${TAG}" echo -e "\nPushing file metrics collector image...\n" docker push "${REGISTRY}/file-metrics-collector:${TAG}" +echo -e "\nPushing kfpv1 metrics collector image...\n" +docker push "${REGISTRY}/kfpv1-metrics-collector:${TAG}" + echo -e "\nPushing TF Event metrics collector image...\n" docker push "${REGISTRY}/tfevent-metrics-collector:${TAG}" From 88c20c35ed1855dfe62c77842b905b111a0ca3e0 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Wed, 21 Jun 2023 13:55:45 +0200 Subject: [PATCH 09/26] Fix suggested typo fixes Co-authored-by: axel7083 <42176370+axel7083@users.noreply.github.com> --- .github/workflows/publish-core-images.yaml | 2 +- examples/v1beta1/kubeflow-pipelines/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index ebc6a37ad2f..7b5bea240f1 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -33,4 +33,4 @@ jobs: - component-name: tfevent-metrics-collector dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile - component-name: kfpv1-metrics-collector - dockerfile: cmd/metricscollector/v1beta1/kvpv1-metricscollector/Dockerfile + dockerfile: cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile diff --git a/examples/v1beta1/kubeflow-pipelines/README.md b/examples/v1beta1/kubeflow-pipelines/README.md index b6e53c21555..0c2ff8b6956 100644 --- a/examples/v1beta1/kubeflow-pipelines/README.md +++ b/examples/v1beta1/kubeflow-pipelines/README.md @@ -35,7 +35,7 @@ The following Pipelines are deployed from Kubeflow Notebook: 2) [Katib Experiment with Early Stopping](early-stopping.ipynb) -3) [Tune parameters of a `MNIST` kubeflow pipeline 
with Katib](pipeline-parameters.ipynb) +3) [Tune parameters of a `MNIST` kubeflow pipeline with Katib](kubeflow-kfpv1-opt-mnist.ipynb) The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI for examples 1 & 2: From 904d07d36839b34ecc98eeb118e8dfb5c906a2a5 Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 21 Jun 2023 14:40:47 +0200 Subject: [PATCH 10/26] Move KFP V1 metrics collector docker files to v1 subfolder As per suggestion --- .github/workflows/publish-core-images.yaml | 2 +- .../v1}/Dockerfile | 0 .../{kfpv1-metricscollector => kfp-metricscollector/v1}/main.py | 0 .../v1}/requirements.txt | 0 scripts/v1beta1/build.sh | 2 +- 5 files changed, 2 insertions(+), 2 deletions(-) rename cmd/metricscollector/v1beta1/{kfpv1-metricscollector => kfp-metricscollector/v1}/Dockerfile (100%) rename cmd/metricscollector/v1beta1/{kfpv1-metricscollector => kfp-metricscollector/v1}/main.py (100%) rename cmd/metricscollector/v1beta1/{kfpv1-metricscollector => kfp-metricscollector/v1}/requirements.txt (100%) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 7b5bea240f1..5708e9ce9ac 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -33,4 +33,4 @@ jobs: - component-name: tfevent-metrics-collector dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile - component-name: kfpv1-metrics-collector - dockerfile: cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile + dockerfile: cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile similarity index 100% rename from cmd/metricscollector/v1beta1/kfpv1-metricscollector/Dockerfile rename to cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile diff --git 
a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py similarity index 100% rename from cmd/metricscollector/v1beta1/kfpv1-metricscollector/main.py rename to cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py diff --git a/cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt similarity index 100% rename from cmd/metricscollector/v1beta1/kfpv1-metricscollector/requirements.txt rename to cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 43952074ae6..0fc1dd167d4 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -72,7 +72,7 @@ echo -e "\nBuilding file metrics collector image...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . echo -e "\nBuilding kfpv1 metrics collector image...\n" -docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfpv1-metricscollector/Dockerfile . +docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfp-metricscollector/v1/Dockerfile . echo -e "\nBuilding TF Event metrics collector image...\n" if [ "${ARCH}" == "ppc64le" ]; then From 31655ddcccee5c10c1c4bf0d524cffb726a800f7 Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 21 Jun 2023 14:52:28 +0200 Subject: [PATCH 11/26] Support loading of folder of metrics collector files As suggested in the PR review, the generic case where multiple KFP pipeline metrics files would be present in the output folder is supported. Note that in the current KFP v1 implementation always only one data file is present. 
--- .../v1beta1/kfp-metricscollector/v1/main.py | 10 +-- pkg/metricscollector/v1beta1/common/const.py | 3 +- .../kfpv1-metricscollector/metrics_loader.py | 76 +++++++++++-------- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py index 900682b9eab..333e70553eb 100644 --- a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py @@ -51,13 +51,6 @@ def parse_options(): parser.add_argument( "-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES ) - parser.add_argument( - "-fn", - "--metrics_file_name", - type=str, - default=const.DEFAULT_METRICS_FILE_KFPV1_FILE, - ) - opt = parser.parse_args() return opt @@ -86,8 +79,7 @@ def parse_options(): ) mc = MetricsCollector(opt.metric_names.split(";")) - metrics_file = os.path.join(opt.metrics_file_dir, opt.metrics_file_name) - observation_log = mc.parse_file(metrics_file) + observation_log = mc.parse_file(opt.metrics_file_dir) channel = grpc.beta.implementations.insecure_channel( db_manager_server[0], int(db_manager_server[1]) diff --git a/pkg/metricscollector/v1beta1/common/const.py b/pkg/metricscollector/v1beta1/common/const.py index 1e5f4a103e8..c155cd04945 100644 --- a/pkg/metricscollector/v1beta1/common/const.py +++ b/pkg/metricscollector/v1beta1/common/const.py @@ -20,9 +20,8 @@ DEFAULT_WAIT_ALL_PROCESSES = "True" # Default value for directory where TF event metrics are reported DEFAULT_METRICS_FILE_DIR = "/log" -# Default value for directory where TF event metrics are reported +# Default value for directory where Kubeflow pipeline metrics are reported DEFAULT_METRICS_FILE_KFPV1_DIR = "/tmp/outputs/mlpipeline_metrics" -DEFAULT_METRICS_FILE_KFPV1_FILE = "data" # Job finished marker in $$$$.pid file when main process is completed TRAINING_COMPLETED = "completed" diff --git 
a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py index ce6bbb41ed3..74e47d6a558 100644 --- a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py +++ b/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TFEventFileParser parses tfevent files and returns an ObservationLog of the metrics specified. -# When the event file is under a directory(e.g. test dir), please specify "{{dirname}}/{{metrics name}}" -# For example, in the Tensorflow MNIST Classification With Summaries: -# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py. -# The "accuracy" and "loss" metric is saved under "train" and "test" directories. -# So in the Metrics Collector specification, please specify name of "train" or "test" directory. -# Check TFJob example for more information: -# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22 +# The Kubeflow pipeline metrics collector KFPMetricParser parses the metrics file +# and returns an ObservationLog of the metrics specified. 
+# Some documentation on the metrics collector file structure can be found here: +# https://v0-6.kubeflow.org/docs/pipelines/sdk/pipelines-metrics/ from datetime import datetime from logging import getLogger, StreamHandler, INFO +import os from typing import List import json @@ -30,29 +27,38 @@ import api_pb2 from pkg.metricscollector.v1beta1.common import const +class KFPMetricParser: + def __init__(self, metric_names): + self.metric_names = metric_names -def parse_metrics(fn: str) -> List[api_pb2.MetricLog]: - """Parse a kubeflow pipeline metrics file + @staticmethod + def find_all_files(directory): + for root, dirs, files in os.walk(directory): + for f in files: + yield os.path.join(root, f) - Args: - fn (function): path to metrics file + def parse_metrics(self, fn: str) -> List[api_pb2.MetricLog]: + """Parse a kubeflow pipeline metrics file - Returns: - List[api_pb2.MetricLog]: A list of logged metrics - """ - metrics = [] - with open(fn) as f: - metrics_dict = json.load(f) - for m in metrics_dict["metrics"]: - name = m["name"] - value = m["numberValue"] - ml = api_pb2.MetricLog( - time_stamp=rfc3339.rfc3339(datetime.now()), - metric=api_pb2.Metric(name=name, value=str(value)), - ) - metrics.append(ml) - return metrics + Args: + fn (function): path to metrics file + Returns: + List[api_pb2.MetricLog]: A list of logged metrics + """ + metrics = [] + with open(fn) as f: + metrics_dict = json.load(f) + for m in metrics_dict["metrics"]: + name = m["name"] + value = m["numberValue"] + if name in self.metric_names: + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339(datetime.now()), + metric=api_pb2.Metric(name=name, value=str(value)), + ) + metrics.append(ml) + return metrics class MetricsCollector: def __init__(self, metric_names): @@ -63,10 +69,20 @@ def __init__(self, metric_names): self.logger.addHandler(handler) self.logger.propagate = False self.metrics = metric_names + self.parser = KFPMetricParser(metric_names) - def parse_file(self, filename): - 
self.logger.info(filename + " will be parsed.") - mls = parse_metrics(filename) + def parse_file(self, directory): + """Parses the Kubeflow Pipeline metrics files""" + mls = [] + for f in self.parser.find_all_files(directory): + if os.path.isdir(f): + continue + try: + self.logger.info(f + " will be parsed.") + mls.extend(self.parser.parse_metrics(f)) + except Exception as e: + self.logger.warning("Unexpected error: " + str(e)) + continue # Metrics logs must contain at least one objective metric value # Objective metric is located at first index From c458541cbe3fff5dffd96557ea649209bef2fd82 Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 21 Jun 2023 14:57:13 +0200 Subject: [PATCH 12/26] Move kfpv1 metricscollector in v1 subfolder As per suggestion this should make it easier to handle the v2 metrics collector in the future as well --- .../v1beta1/kfp-metricscollector/v1/Dockerfile | 4 ++-- .../v1}/__init__.py | 0 .../v1}/metrics_loader.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename pkg/metricscollector/v1beta1/{kfpv1-metricscollector => kfp-metricscollector/v1}/__init__.py (100%) rename pkg/metricscollector/v1beta1/{kfpv1-metricscollector => kfp-metricscollector/v1}/metrics_loader.py (100%) diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile index 21771e8dccf..9d7722e5f30 100644 --- a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile @@ -2,8 +2,8 @@ FROM python:3.10-slim ARG TARGETARCH ENV TARGET_DIR /opt/katib -ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfpv1-metricscollector -ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfpv1-metricscollector/::${TARGET_DIR}/pkg/metricscollector/v1beta1/common/ +ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfp-metricscollector/v1 +ENV PYTHONPATH 
${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfp-metricscollector/v1::${TARGET_DIR}/pkg/metricscollector/v1beta1/common/ ADD ./pkg/ ${TARGET_DIR}/pkg/ ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/ diff --git a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/__init__.py b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/__init__.py similarity index 100% rename from pkg/metricscollector/v1beta1/kfpv1-metricscollector/__init__.py rename to pkg/metricscollector/v1beta1/kfp-metricscollector/v1/__init__.py diff --git a/pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py similarity index 100% rename from pkg/metricscollector/v1beta1/kfpv1-metricscollector/metrics_loader.py rename to pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py From cee997009a9cc6dbc1ef3033dedf2d2ad082d140 Mon Sep 17 00:00:00 2001 From: votti Date: Wed, 21 Jun 2023 15:06:00 +0200 Subject: [PATCH 13/26] Remove duplicated notebook section --- .../kubeflow-kfpv1-opt-mnist.ipynb | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb index 801108b68b1..0c5c4af979f 100644 --- a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -46,32 +46,7 @@ " -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args/-\", \"value\": \"--trial-resources=Workflow.v1alpha1.argoproj.io\"}]'`\n", "\n", "For more details and how to set this up on a partial Kubeflow installation follow:\n", - "https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.mdd\n", - "If you are running on a full Kubeflow installation *DO NOT INSTALL ARGO* as this 
will likely break your installation.\n", - "\n", - "Just run the following commands:\n", - "\n", - "Enable side-car injection:\n", - "\n", - "`kubectl patch namespace argo -p '{\"metadata\":{\"labels\":{\"katib.kubeflow.org/metrics-collector-injection\":\"enabled\"}}}'`\n", - "\n", - "\n", - "Verify that the emissary executor is active (should be default in newer Kubeflow installations):\n", - "\n", - "` kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor`\n", - "\n", - "Patch the Katib controller:\n", - "\n", - "`kubectl patch ClusterRole katib-controller -n kubeflow --type=json \\\n", - " -p='[{\"op\": \"add\", \"path\": \"/rules/-\", \"value\": {\"apiGroups\":[\"argoproj.io\"],\"resources\":[\"workflows\"],\"verbs\":[\"get\", \"list\", \"watch\", \"create\", \"delete\"]}}]'\n", - "`\n", - "\n", - "`kubectl patch Deployment katib-controller -n kubeflow --type=json \\\n", - " -p='[{\"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/args/-\", \"value\": \"--trial-resources=Workflow.v1alpha1.argoproj.io\"}]'`\n", - "\n", - "For more details and how to set this up on a partial Kubeflow installation follow:\n", - "https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.md\n", - "\n" + "https://github.com/kubeflow/katib/tree/master/examples/v1beta1/argo/README.mdd" ] }, { From f7e697b2e8d7cb80069405afbcbe5cf3887acad3 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Tue, 18 Jul 2023 21:20:20 +0200 Subject: [PATCH 14/26] Add dependencies for KFPv1 e2e testing This installs Kubeflow pipelines (KFP) if selected to do so in order to run e2e tests where Katib and KFP interact. 
--- .../v1beta1/scripts/gh-actions/setup-katib.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index e2547e2efad..01b7400fb0a 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -23,10 +23,17 @@ cd "$(dirname "$0")" DEPLOY_KATIB_UI=${1:-false} DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} +DEPLOY_KFP=${4:-false} E2E_TEST_IMAGE_TAG="e2e-test" TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" +KFP_ENV=platform-agnostic-emissary +KFP_BASE_URL="github.com/kubeflow/pipelines/manifests/kustomize" +# This is one of the latest KFPv1 version which was compatible with a +# recent K8s version at the time of writing (eg 1.8.22 gave an error). +KFP_VERSION="1.8.1" + echo "Start to install Katib" # Update Katib images with `e2e-test`. @@ -61,6 +68,17 @@ if "$DEPLOY_TRAINING_OPERATOR"; then kustomize build "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION" | kubectl apply -f - fi +# If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines. 
+# found at: https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize +if "$DEPLOY_KFP"; then + echo "Deploying Kubeflow Pipelines version $KFP_VERSION" + kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}" + kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s + kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}" + kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s + #kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80 +fi + echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - From 36ed3727701c257f327493e2e012c4f7df7bf51c Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Tue, 18 Jul 2023 21:20:42 +0200 Subject: [PATCH 15/26] TMP: changes to run tests locally This commit should be removed later --- manifests/v1beta1/components/mysql/pvc.yaml | 3 +-- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/manifests/v1beta1/components/mysql/pvc.yaml b/manifests/v1beta1/components/mysql/pvc.yaml index 9249d8c6ea2..152f43bba9a 100644 --- a/manifests/v1beta1/components/mysql/pvc.yaml +++ b/manifests/v1beta1/components/mysql/pvc.yaml @@ -1,4 +1,3 @@ ---- apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -9,4 +8,4 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 2Gi diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 01b7400fb0a..d0a859f8192 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} DEPLOY_KFP=${4:-false} -E2E_TEST_IMAGE_TAG="e2e-test" +E2E_TEST_IMAGE_TAG="v0.15.0" TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" KFP_ENV=platform-agnostic-emissary 
@@ -51,12 +51,12 @@ fi # If the user wants to deploy Katib UI, then use the kustomization file for Katib UI. if ! "$DEPLOY_KATIB_UI"; then - index="$(yq eval '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)" - index="$index" yq eval -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE + index="$(yq -y '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)" + index="$index" yq -y -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE fi # Since e2e test doesn't need to large storage, we use a small PVC for Katib. -yq eval -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE +yq -y -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE echo -e "\n The Katib will be deployed with the following configs" cat $KUSTOMIZATION_FILE From 15c4a4b8fc32a5f339fa1b5d8cc542016d8941b8 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Tue, 18 Jul 2023 21:52:08 +0200 Subject: [PATCH 16/26] Add missing ClusterRole update These permissions are required such that the katib-controller can launch argo workflows. 
--- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index d0a859f8192..83ef1888de2 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -77,6 +77,7 @@ if "$DEPLOY_KFP"; then kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}" kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s #kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80 + kubectl patch ClusterRole katib-controller -n kubeflow --type=json -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]' fi echo "Deploying Katib" From 741059fa5b88f9a13a22247aad552d7cc873f898 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Tue, 18 Jul 2023 22:05:03 +0200 Subject: [PATCH 17/26] Remove accidentally included `self` --- scripts/v1beta1/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 0fc1dd167d4..b4fa896bc2e 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -72,7 +72,7 @@ echo -e "\nBuilding file metrics collector image...\n" docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . echo -e "\nBuilding kfpv1 metrics collector image...\n" -docker buildx build --platform "linux/${ARCH}" self, -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfp-metricscollector/v1/Dockerfile . +docker buildx build --platform "linux/${ARCH}" -t "${REGISTRY}/kfpv1-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/kfp-metricscollector/v1/Dockerfile . 
echo -e "\nBuilding TF Event metrics collector image...\n" if [ "${ARCH}" == "ppc64le" ]; then From 7d33b7b11eec7fdc47471ce6b20508ff62660831 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Tue, 18 Jul 2023 22:14:27 +0200 Subject: [PATCH 18/26] Rename parameter to more meaningful name --- .../v1beta1/kfp-metricscollector/v1/metrics_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py index 74e47d6a558..90e1764b7e8 100644 --- a/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py +++ b/pkg/metricscollector/v1beta1/kfp-metricscollector/v1/metrics_loader.py @@ -37,7 +37,7 @@ def find_all_files(directory): for f in files: yield os.path.join(root, f) - def parse_metrics(self, fn: str) -> List[api_pb2.MetricLog]: + def parse_metrics(self, metric_file_path: str) -> List[api_pb2.MetricLog]: """Parse a kubeflow pipeline metrics file Args: @@ -47,7 +47,7 @@ def parse_metrics(self, fn: str) -> List[api_pb2.MetricLog]: List[api_pb2.MetricLog]: A list of logged metrics """ metrics = [] - with open(fn) as f: + with open(metric_file_path) as f: metrics_dict = json.load(f) for m in metrics_dict["metrics"]: name = m["name"] From 35df815e014f7ebf4df7ce1ff291eacdab0cb24d Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Thu, 20 Jul 2023 22:41:43 +0200 Subject: [PATCH 19/26] Extend example notebook with simple example for e2e tests This adds a dummy e2e example that can be used to test the main functionality.
--- .../kubeflow-kfpv1-opt-mnist.ipynb | 974 +++++++++++++++++- 1 file changed, 938 insertions(+), 36 deletions(-) diff --git a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb index 0c5c4af979f..6efb26732a0 100644 --- a/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/kubeflow-kfpv1-opt-mnist.ipynb @@ -77,15 +77,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# Namespace to run the workloads under\n", - "USER_NAMESPACE = \"vito-zanotelli\"\n", + "USER_NAMESPACE = \"kubeflow\" # On a full installation this would be your user namespace\n", "# Pipeline service account\n", "# On a Kubeflow instance on GCP this should be 'default-editor'\n", - "KFP_SERVICE_ACCOUNT = \"default-editor\"\n", + "KFP_SERVICE_ACCOUNT = \"pipeline-runner\"\n", "\n", "\n", "# Consmetic variables\n", @@ -94,7 +94,9 @@ "KFP_RUN = \"mnist-pipeline-v1\"\n", "\n", "# Katib run variables\n", - "KATIB_EXPERIMENT = \"katib-kfp-example-v1\"" + "KATIB_EXPERIMENT = \"katib-kfp-example-v1\"\n", + "KATIB_E2E_EXPERIMENT = \"katib-kfp-example-e2e-v1\"\n", + "KATIB_WORKLFLOW_COLLECTOR_IMAGE = \"docker.io/kubeflowkatib/kfpv1-metrics-collector:latest\" #\"docker.io/votti/kfpv1-metricscollector:v0.0.10\"" ] }, { @@ -107,9 +109,86 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: kfp==1.8.12 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (1.8.12)\n", + "Requirement already satisfied: absl-py<2,>=0.9 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.4.0)\n", + "Requirement already satisfied: PyYAML<6,>=5.3 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (5.4.1)\n", + "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (2.10.2)\n", + "Requirement already satisfied: google-cloud-storage<2,>=1.20.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.44.0)\n", + "Requirement already satisfied: kubernetes<19,>=8.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (18.20.0)\n", + "Requirement already satisfied: google-api-python-client<2,>=1.7.8 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.12.11)\n", + "Requirement already satisfied: google-auth<2,>=1.6.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.35.0)\n", + "Requirement already satisfied: requests-toolbelt<1,>=0.8.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.10.1)\n", + "Requirement already satisfied: cloudpickle<3,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (2.2.1)\n", + "Requirement already satisfied: kfp-server-api<2.0.0,>=1.1.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.8.5)\n", + "Requirement already satisfied: jsonschema<4,>=3.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.2.0)\n", + "Requirement already satisfied: tabulate<1,>=0.8.6 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.9.0)\n", + "Requirement already satisfied: click<9,>=7.1.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (8.1.3)\n", + "Requirement already satisfied: Deprecated<2,>=1.2.7 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.2.14)\n", + "Requirement already satisfied: strip-hints<1,>=0.1.8 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.1.10)\n", + "Requirement already satisfied: docstring-parser<1,>=0.7.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.15)\n", + "Requirement already satisfied: kfp-pipeline-spec<0.2.0,>=0.1.14 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.1.16)\n", + "Requirement already satisfied: fire<1,>=0.3.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.5.0)\n", + "Requirement already satisfied: protobuf<4,>=3.13.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.20.3)\n", + "Requirement already satisfied: uritemplate<4,>=3.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (3.0.1)\n", + "Requirement already satisfied: pydantic<2,>=1.8.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (1.10.9)\n", + "Requirement already satisfied: typer<1.0,>=0.3.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp==1.8.12) (0.9.0)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from Deprecated<2,>=1.2.7->kfp==1.8.12) (1.15.0)\n", + "Requirement already satisfied: six in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from fire<1,>=0.3.1->kfp==1.8.12) (1.16.0)\n", + "Requirement already satisfied: termcolor in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from fire<1,>=0.3.1->kfp==1.8.12) (2.3.0)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from 
google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (1.59.1)\n", + "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (2.31.0)\n", + "Requirement already satisfied: httplib2<1dev,>=0.15.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-python-client<2,>=1.7.8->kfp==1.8.12) (0.22.0)\n", + "Requirement already satisfied: google-auth-httplib2>=0.0.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-api-python-client<2,>=1.7.8->kfp==1.8.12) (0.1.0)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (0.3.0)\n", + "Requirement already satisfied: setuptools>=40.3.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (67.7.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth<2,>=1.6.1->kfp==1.8.12) (4.9)\n", + "Requirement already satisfied: google-cloud-core<3.0dev,>=1.6.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (2.3.2)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=1.3.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (2.5.0)\n", + "Requirement already satisfied: attrs>=17.4.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from jsonschema<4,>=3.0.1->kfp==1.8.12) (23.1.0)\n", + 
"Requirement already satisfied: pyrsistent>=0.14.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from jsonschema<4,>=3.0.1->kfp==1.8.12) (0.19.3)\n", + "Requirement already satisfied: urllib3>=1.15 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (1.26.15)\n", + "Requirement already satisfied: certifi in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (2023.5.7)\n", + "Requirement already satisfied: python-dateutil in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kfp-server-api<2.0.0,>=1.1.2->kfp==1.8.12) (2.8.2)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes<19,>=8.0.0->kfp==1.8.12) (1.6.0)\n", + "Requirement already satisfied: requests-oauthlib in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes<19,>=8.0.0->kfp==1.8.12) (1.3.1)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pydantic<2,>=1.8.2->kfp==1.8.12) (4.6.3)\n", + "Requirement already satisfied: wheel in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from strip-hints<1,>=0.1.8->kfp==1.8.12) (0.40.0)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-resumable-media<3.0dev,>=1.3.0->google-cloud-storage<2,>=1.20.0->kfp==1.8.12) (1.5.0)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from httplib2<1dev,>=0.15.0->google-api-python-client<2,>=1.7.8->kfp==1.8.12) (3.1.0)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.1->kfp==1.8.12) (0.5.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->kfp==1.8.12) (3.4)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests-oauthlib->kubernetes<19,>=8.0.0->kfp==1.8.12) (3.2.2)\n", + "Requirement already satisfied: kubeflow-katib==0.13.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (0.13.0)\n", + "Requirement already satisfied: certifi>=14.05.14 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (2023.5.7)\n", + "Requirement already satisfied: six>=1.10 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (1.16.0)\n", + "Requirement already satisfied: setuptools>=21.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (67.7.2)\n", + "Requirement already satisfied: urllib3>=1.15.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (1.26.15)\n", + "Requirement already satisfied: kubernetes>=12.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubeflow-katib==0.13.0) (18.20.0)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (2.8.2)\n", + "Requirement already 
satisfied: pyyaml>=5.4.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (5.4.1)\n", + "Requirement already satisfied: google-auth>=1.0.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.35.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.6.0)\n", + "Requirement already satisfied: requests in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (2.31.0)\n", + "Requirement already satisfied: requests-oauthlib in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from kubernetes>=12.0.0->kubeflow-katib==0.13.0) (1.3.1)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (0.3.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.4)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in 
/home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from requests-oauthlib->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (3.2.2)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /home/vitoz/mambaforge/envs/katibdev/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-katib==0.13.0) (0.5.0)\n" + ] + } + ], "source": [ "# Install required packages (Kubeflow Pipelines and Katib SDK).\n", "!pip install kfp==1.8.12\n", @@ -118,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -142,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -161,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -182,7 +261,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -253,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -334,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -447,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -461,13 +540,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name=\"Download MNIST dataset\",\n", - " description=\"A pipeline to download the MNIST dataset files\",\n", + " description=\"A pipeline to train MNIST classification from scratch.\",\n", ")\n", "def 
mnist_training_pipeline(\n", " lr: float = 1e-4,\n", @@ -530,9 +609,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Experiment details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "kfp_run = f\"{KFP_RUN}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", "run = kfp_client.create_run_from_pipeline_func(\n", @@ -543,7 +647,8 @@ " arguments={\"histogram_norm\": \"0\"},\n", " experiment_name=KFP_EXPERIMENT,\n", " run_name=kfp_run,\n", - " namespace=USER_NAMESPACE,\n", + " # In a multiuser setup, provide the namespace\n", + " #namespace=USER_NAMESPACE,\n", ")" ] }, @@ -569,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -655,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -696,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -736,7 +841,7 @@ " \"-path\",\n", " \"/tmp/outputs/mlpipeline_metrics\",\n", " ],\n", - " \"image\": \"docker.io/kubeflowkatib/kfpv1-metrics-collector:latest\",\n", + " \"image\": KATIB_WORKLFLOW_COLLECTOR_IMAGE,\n", " \"imagePullPolicy\": \"Always\",\n", " \"name\": \"custom-metrics-logger-and-collector\",\n", " \"env\": [\n", @@ -767,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -878,7 +983,7 @@ }, { "cell_type": "code", - "execution_count": 
null, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -935,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -965,9 +1070,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'katib_spec' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 11\u001b[0m\n\u001b[1;32m 1\u001b[0m katib_experiment_name \u001b[39m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mKATIB_EXPERIMENT\u001b[39m}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{\u001b[39;00mdt\u001b[39m.\u001b[39mtoday()\u001b[39m.\u001b[39mstrftime(\u001b[39m'\u001b[39m\u001b[39m%\u001b[39m\u001b[39mY-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mm-\u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mHh-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mMm-\u001b[39m\u001b[39m%\u001b[39m\u001b[39mSs\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m )\n\u001b[1;32m 4\u001b[0m katib_experiment \u001b[39m=\u001b[39m V1beta1Experiment(\n\u001b[1;32m 5\u001b[0m api_version\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mkubeflow.org/v1beta1\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m kind\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mExperiment\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m metadata\u001b[39m=\u001b[39mV1ObjectMeta(\n\u001b[1;32m 8\u001b[0m name\u001b[39m=\u001b[39mkatib_experiment_name,\n\u001b[1;32m 9\u001b[0m namespace\u001b[39m=\u001b[39mUSER_NAMESPACE,\n\u001b[1;32m 10\u001b[0m ),\n\u001b[0;32m---> 
11\u001b[0m spec\u001b[39m=\u001b[39mkatib_spec,\n\u001b[1;32m 12\u001b[0m )\n", + "\u001b[0;31mNameError\u001b[0m: name 'katib_spec' is not defined" + ] + } + ], "source": [ "katib_experiment_name = (\n", " f\"{KATIB_EXPERIMENT}-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", @@ -993,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1011,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1020,9 +1137,329 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'creationTimestamp': '2023-07-20T19:40:11Z',\n", + " 'generation': 1,\n", + " 'managedFields': [{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'fieldsType': 'FieldsV1',\n", + " 'fieldsV1': {'f:spec': {'.': {},\n", + " 'f:algorithm': {'.': {}, 'f:algorithmName': {}},\n", + " 'f:maxFailedTrialCount': {},\n", + " 'f:maxTrialCount': {},\n", + " 'f:metricsCollectorSpec': {'.': {},\n", + " 'f:collector': {'.': {},\n", + " 'f:customCollector': {'.': {},\n", + " 'f:args': {},\n", + " 'f:env': {},\n", + " 'f:image': {},\n", + " 'f:imagePullPolicy': {},\n", + " 'f:name': {}},\n", + " 'f:kind': {}},\n", + " 'f:source': {'.': {},\n", + " 'f:fileSystemPath': {'.': {}, 'f:kind': {}, 'f:path': {}}}},\n", + " 'f:objective': {'.': {},\n", + " 'f:additionalMetricNames': {},\n", + " 'f:goal': {},\n", + " 'f:objectiveMetricName': {},\n", + " 'f:type': {}},\n", + " 'f:parallelTrialCount': {},\n", + " 'f:parameters': {},\n", + " 'f:trialTemplate': {'.': {},\n", + " 'f:failureCondition': {},\n", + " 'f:primaryContainerName': {},\n", + " 'f:primaryPodLabels': {'.': {},\n", + " 'f:katib.kubeflow.org/model-training': {}},\n", + " 'f:retain': {},\n", + " 
'f:successCondition': {},\n", + " 'f:trialParameters': {},\n", + " 'f:trialSpec': {'.': {},\n", + " 'f:apiVersion': {},\n", + " 'f:kind': {},\n", + " 'f:metadata': {'.': {},\n", + " 'f:annotations': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_compilation_time': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_spec': {}},\n", + " 'f:generateName': {},\n", + " 'f:labels': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {}}},\n", + " 'f:spec': {'.': {},\n", + " 'f:arguments': {'.': {}, 'f:parameters': {}},\n", + " 'f:entrypoint': {},\n", + " 'f:serviceAccountName': {},\n", + " 'f:templates': {}}}}}},\n", + " 'manager': 'OpenAPI-Generator',\n", + " 'operation': 'Update',\n", + " 'time': '2023-07-20T19:40:11Z'}],\n", + " 'name': 'katib-kfp-example-v1-2023-07-20-21h-40m-05s',\n", + " 'namespace': 'kubeflow',\n", + " 'resourceVersion': '6526',\n", + " 'uid': '68d7df06-e02d-4d1e-932c-c3032f7ecaff'},\n", + " 'spec': {'algorithm': {'algorithmName': 'random'},\n", + " 'maxFailedTrialCount': 2,\n", + " 'maxTrialCount': 5,\n", + " 'metricsCollectorSpec': {'collector': {'customCollector': {'args': ['-m',\n", + " 'val-accuracy;accuracy',\n", + " '-s',\n", + " 'katib-db-manager.kubeflow:6789',\n", + " '-t',\n", + " '$(PodName)',\n", + " '-path',\n", + " '/tmp/outputs/mlpipeline_metrics'],\n", + " 'env': [{'name': 'PodName',\n", + " 'valueFrom': {'fieldRef': {'fieldPath': 'metadata.name'}}}],\n", + " 'image': 'docker.io/kubeflowkatib/kfpv1-metrics-collector:latest',\n", + " 'imagePullPolicy': 'Always',\n", + " 'name': 'custom-metrics-logger-and-collector',\n", + " 'resources': {}},\n", + " 'kind': 'Custom'},\n", + " 'source': {'fileSystemPath': {'kind': 'File',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}}},\n", + " 'objective': {'additionalMetricNames': ['accuracy'],\n", + " 'goal': 0.9,\n", + " 'metricStrategies': [{'name': 'val-accuracy', 'value': 'max'},\n", + " {'name': 'accuracy', 'value': 
'max'}],\n", + " 'objectiveMetricName': 'val-accuracy',\n", + " 'type': 'maximize'},\n", + " 'parallelTrialCount': 5,\n", + " 'parameters': [{'feasibleSpace': {'max': '0.001', 'min': '0.00001'},\n", + " 'name': 'learning_rate',\n", + " 'parameterType': 'double'},\n", + " {'feasibleSpace': {'max': '64', 'min': '16'},\n", + " 'name': 'batch_size',\n", + " 'parameterType': 'int'},\n", + " {'feasibleSpace': {'list': ['0', '1']},\n", + " 'name': 'histogram_norm',\n", + " 'parameterType': 'discrete'}],\n", + " 'resumePolicy': 'Never',\n", + " 'trialTemplate': {'failureCondition': 'status.[@this].#(phase==\"Failed\")#',\n", + " 'primaryContainerName': 'main',\n", + " 'primaryPodLabels': {'katib.kubeflow.org/model-training': 'true'},\n", + " 'successCondition': 'status.[@this].#(phase==\"Succeeded\")#',\n", + " 'trialParameters': [{'description': 'Learning rate for the training model',\n", + " 'name': 'learningRate',\n", + " 'reference': 'learning_rate'},\n", + " {'description': 'Batch size for NN training',\n", + " 'name': 'batchSize',\n", + " 'reference': 'batch_size'},\n", + " {'description': 'Histogram normalization of image on?',\n", + " 'name': 'histogramNorm',\n", + " 'reference': 'histogram_norm'}],\n", + " 'trialSpec': {'apiVersion': 'argoproj.io/v1alpha1',\n", + " 'kind': 'Workflow',\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline_compilation_time': '2023-07-20T21:40:03.664402',\n", + " 'pipelines.kubeflow.org/pipeline_spec': '{\"description\": \"A pipeline to download the MNIST dataset files\", \"inputs\": [{\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"3\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, 
{\"default\": \"5\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}, {\"default\": \"${trialParameters.learningRate}\", \"name\": \"lr\"}, {\"default\": \"${trialParameters.batchSize}\", \"name\": \"batch_size\"}, {\"default\": \"${trialParameters.histogramNorm}\", \"name\": \"histogram_norm\"}], \"name\": \"Download MNIST dataset\"}'},\n", + " 'generateName': 'download-mnist-dataset-',\n", + " 'labels': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12'}},\n", + " 'spec': {'arguments': {'parameters': [{'name': 'lr',\n", + " 'value': '${trialParameters.learningRate}'},\n", + " {'name': 'optimizer', 'value': 'Adam'},\n", + " {'name': 'loss', 'value': 'categorical_crossentropy'},\n", + " {'name': 'epochs', 'value': '3'},\n", + " {'name': 'batch_size', 'value': '${trialParameters.batchSize}'},\n", + " {'name': 'histogram_norm',\n", + " 'value': '${trialParameters.histogramNorm}'}]},\n", + " 'entrypoint': 'download-mnist-dataset',\n", + " 'serviceAccountName': 'pipeline-runner',\n", + " 'templates': [{'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-exc',\n", + " 'url=\"$0\"\\noutput_path=\"$1\"\\ncurl_options=\"$2\"\\n\\nmkdir -p \"$(dirname \"$output_path\")\"\\ncurl --get \"$url\" --output \"$output_path\" $curl_options\\n',\n", + " 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',\n", + " '/tmp/outputs/Data/data',\n", + " '--location'],\n", + " 'image': 'byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342'},\n", + " 'metadata': {'annotations': {'author': 'Alexey Volkov ',\n", + " 'canonical_location': 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml',\n", + " 'pipelines.kubeflow.org/arguments.parameters': '{\"Url\": \"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\", \"curl options\": \"--location\"}',\n", + 
" 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"2f61f2edf713f214934bd286791877a1a3a37f31a4de4368b90e3b76743f1523\", \"url\": \"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-exc\", \"url=\\\\\"$0\\\\\"\\\\noutput_path=\\\\\"$1\\\\\"\\\\ncurl_options=\\\\\"$2\\\\\"\\\\n\\\\nmkdir -p \\\\\"$(dirname \\\\\"$output_path\\\\\")\\\\\"\\\\ncurl --get \\\\\"$url\\\\\" --output \\\\\"$output_path\\\\\" $curl_options\\\\n\", {\"inputValue\": \"Url\"}, {\"outputPath\": \"Data\"}, {\"inputValue\": \"curl options\"}], \"image\": \"byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342\"}}, \"inputs\": [{\"name\": \"Url\", \"type\": \"URI\"}, {\"default\": \"--location\", \"description\": \"Additional options given to the curl bprogram. See https://curl.haxx.se/docs/manpage.html\", \"name\": \"curl options\", \"type\": \"string\"}], \"metadata\": {\"annotations\": {\"author\": \"Alexey Volkov \", \"canonical_location\": \"https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml\"}}, \"name\": \"Download data\", \"outputs\": [{\"name\": \"Data\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Download training images'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'download-data',\n", + " 'outputs': {'artifacts': [{'name': 'download-data-Data',\n", + " 'path': '/tmp/outputs/Data/data'}]}},\n", + " {'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-exc',\n", + " 'url=\"$0\"\\noutput_path=\"$1\"\\ncurl_options=\"$2\"\\n\\nmkdir -p \"$(dirname \"$output_path\")\"\\ncurl --get 
\"$url\" --output \"$output_path\" $curl_options\\n',\n", + " 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',\n", + " '/tmp/outputs/Data/data',\n", + " '--location'],\n", + " 'image': 'byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342'},\n", + " 'metadata': {'annotations': {'author': 'Alexey Volkov ',\n", + " 'canonical_location': 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml',\n", + " 'pipelines.kubeflow.org/arguments.parameters': '{\"Url\": \"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\", \"curl options\": \"--location\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"2f61f2edf713f214934bd286791877a1a3a37f31a4de4368b90e3b76743f1523\", \"url\": \"https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-exc\", \"url=\\\\\"$0\\\\\"\\\\noutput_path=\\\\\"$1\\\\\"\\\\ncurl_options=\\\\\"$2\\\\\"\\\\n\\\\nmkdir -p \\\\\"$(dirname \\\\\"$output_path\\\\\")\\\\\"\\\\ncurl --get \\\\\"$url\\\\\" --output \\\\\"$output_path\\\\\" $curl_options\\\\n\", {\"inputValue\": \"Url\"}, {\"outputPath\": \"Data\"}, {\"inputValue\": \"curl options\"}], \"image\": \"byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342\"}}, \"inputs\": [{\"name\": \"Url\", \"type\": \"URI\"}, {\"default\": \"--location\", \"description\": \"Additional options given to the curl bprogram. 
See https://curl.haxx.se/docs/manpage.html\", \"name\": \"curl options\", \"type\": \"string\"}], \"metadata\": {\"annotations\": {\"author\": \"Alexey Volkov \", \"canonical_location\": \"https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/web/Download/component.yaml\"}}, \"name\": \"Download data\", \"outputs\": [{\"name\": \"Data\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Download training labels'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'download-data-2',\n", + " 'outputs': {'artifacts': [{'name': 'download-data-2-Data',\n", + " 'path': '/tmp/outputs/Data/data'}]}},\n", + " {'dag': {'tasks': [{'name': 'download-data',\n", + " 'template': 'download-data'},\n", + " {'name': 'download-data-2', 'template': 'download-data-2'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.download-data-2.outputs.artifacts.download-data-2-Data}}',\n", + " 'name': 'download-data-2-Data'},\n", + " {'from': '{{tasks.download-data.outputs.artifacts.download-data-Data}}',\n", + " 'name': 'download-data-Data'}]},\n", + " 'dependencies': ['download-data', 'download-data-2'],\n", + " 'name': 'parse-mnist',\n", + " 'template': 'parse-mnist'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.parse-mnist.outputs.artifacts.parse-mnist-Dataset}}',\n", + " 'name': 'parse-mnist-Dataset'}],\n", + " 'parameters': [{'name': 'histogram_norm',\n", + " 'value': '{{inputs.parameters.histogram_norm}}'}]},\n", + " 'dependencies': ['parse-mnist'],\n", + " 'name': 'process',\n", + " 'template': 'process'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.process.outputs.artifacts.process-data_processed}}',\n", + " 'name': 'process-data_processed'}],\n", + " 'parameters': [{'name': 'batch_size',\n", + " 'value': 
'{{inputs.parameters.batch_size}}'},\n", + " {'name': 'epochs', 'value': '{{inputs.parameters.epochs}}'},\n", + " {'name': 'loss', 'value': '{{inputs.parameters.loss}}'},\n", + " {'name': 'lr', 'value': '{{inputs.parameters.lr}}'},\n", + " {'name': 'optimizer',\n", + " 'value': '{{inputs.parameters.optimizer}}'}]},\n", + " 'dependencies': ['process'],\n", + " 'name': 'train',\n", + " 'template': 'train'}]},\n", + " 'inputs': {'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'histogram_norm'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'name': 'download-mnist-dataset'},\n", + " {'container': {'args': [],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " '# This is how additional packages can be installed dynamically\\npython3 -m pip install pip idx2numpy\\n# Run the rest of the command after installing the packages.\\n\"$0\" \"$@\"\\n',\n", + " 'python3',\n", + " '-u',\n", + " '-c',\n", + " \"import gzip\\nimport idx2numpy\\nimport sys\\nfrom pathlib import Path\\nimport pickle\\nimport tensorflow as tf\\nimg_path = sys.argv[1]\\nlabel_path = sys.argv[2]\\noutput_path = sys.argv[3]\\nwith gzip.open(img_path, 'rb') as f:\\n x = idx2numpy.convert_from_string(f.read())\\nwith gzip.open(label_path, 'rb') as f:\\n y = idx2numpy.convert_from_string(f.read())\\n#one-hot encode the categories\\nx_out = tf.convert_to_tensor(x)\\ny_out = tf.keras.utils.to_categorical(y)\\nPath(output_path).parent.mkdir(parents=True, exist_ok=True)\\nwith open(output_path, 'wb') as output_file:\\n pickle.dump((x_out, y_out), output_file)\\n\",\n", + " '/tmp/inputs/Images/data',\n", + " '/tmp/inputs/Labels/data',\n", + " '/tmp/outputs/Dataset/data'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1'},\n", + " 'inputs': {'artifacts': [{'name': 'download-data-Data',\n", + " 'path': '/tmp/inputs/Images/data'},\n", + " {'name': 'download-data-2-Data', 'path': '/tmp/inputs/Labels/data'}]},\n", + " 'metadata': 
{'annotations': {'author': 'Vito Zanotelli, D-ONE.ai',\n", + " 'description': 'Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml',\n", + " 'pipelines.kubeflow.org/component_ref': '{\"digest\": \"80825e6ec527562f31b6fdba1bae9a42dae5032c8654f4b9d39cb97a3dc4ed23\"}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"command\": [\"sh\", \"-ec\", \"# This is how additional packages can be installed dynamically\\\\npython3 -m pip install pip idx2numpy\\\\n# Run the rest of the command after installing the packages.\\\\n\\\\\"$0\\\\\" \\\\\"$@\\\\\"\\\\n\", \"python3\", \"-u\", \"-c\", \"import gzip\\\\nimport idx2numpy\\\\nimport sys\\\\nfrom pathlib import Path\\\\nimport pickle\\\\nimport tensorflow as tf\\\\nimg_path = sys.argv[1]\\\\nlabel_path = sys.argv[2]\\\\noutput_path = sys.argv[3]\\\\nwith gzip.open(img_path, \\'rb\\') as f:\\\\n x = idx2numpy.convert_from_string(f.read())\\\\nwith gzip.open(label_path, \\'rb\\') as f:\\\\n y = idx2numpy.convert_from_string(f.read())\\\\n#one-hot encode the categories\\\\nx_out = tf.convert_to_tensor(x)\\\\ny_out = tf.keras.utils.to_categorical(y)\\\\nPath(output_path).parent.mkdir(parents=True, exist_ok=True)\\\\nwith open(output_path, \\'wb\\') as output_file:\\\\n pickle.dump((x_out, y_out), output_file)\\\\n\", {\"inputPath\": \"Images\"}, {\"inputPath\": \"Labels\"}, {\"outputPath\": \"Dataset\"}], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"description\": \"gziped images in the idx format\", \"name\": \"Images\"}, {\"description\": \"gziped labels in the idx format\", \"name\": \"Labels\"}], \"metadata\": {\"annotations\": {\"author\": \"Vito Zanotelli, D-ONE.ai\", \"description\": \"Based on https://github.com/kubeflow/pipelines/blob/master/components/contrib/sample/Python_script/component.yaml\"}}, \"name\": \"Parse MNIST\", \"outputs\": [{\"name\": \"Dataset\"}]}',\n", + " 
'pipelines.kubeflow.org/task_display_name': 'Prepare train dataset'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'parse-mnist',\n", + " 'outputs': {'artifacts': [{'name': 'parse-mnist-Dataset',\n", + " 'path': '/tmp/outputs/Dataset/data'}]}},\n", + " {'container': {'args': ['--data-raw',\n", + " '/tmp/inputs/data_raw/data',\n", + " '--val-pct',\n", + " '0.2',\n", + " '--trainset-flag',\n", + " 'True',\n", + " '--histogram-norm',\n", + " '{{inputs.parameters.histogram_norm}}',\n", + " '--data-processed',\n", + " '/tmp/outputs/data_processed/data'],\n", + " 'command': ['sh',\n", + " '-c',\n", + " '(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' --user) && \"$0\" \"$@\"',\n", + " 'sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef process(\\n data_raw_path, # type: ignore\\n data_processed_path, # type: ignore\\n val_pct = 0.2,\\n trainset_flag = True,\\n histogram_norm = False,\\n):\\n \"\"\"\\n Here we do all the preprocessing\\n if the data path is for training data we:\\n (1) Normalize the data\\n (2) split the train and val data\\n If it is for unseen test data, we:\\n (1) Normalize the data\\n This function returns in any case the processed data path\\n \"\"\"\\n # sklearn\\n import pickle\\n from sklearn.model_selection import train_test_split\\n import 
tensorflow as tf\\n import tensorflow_addons as tfa\\n\\n def img_norm(x):\\n x_ = tf.reshape(x, list(x.shape) + [1])\\n\\n if histogram_norm:\\n x_ = tfa.image.equalize(x_)\\n\\n # Scale between 0-1\\n x_ = x_ / 255\\n return x_\\n\\n with open(data_raw_path, \"rb\") as f:\\n x, y = pickle.load(f)\\n if trainset_flag:\\n\\n x_ = img_norm(x)\\n x_train, x_val, y_train, y_val = train_test_split(\\n x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\\n )\\n\\n with open(data_processed_path, \"wb\") as output_file:\\n pickle.dump((x_train, y_train, x_val, y_val), output_file)\\n\\n else:\\n x_ = img_norm(x)\\n with open(data_processed_path, \"wb\") as output_file:\\n pickle.dump((x_, y), output_file)\\n\\ndef _deserialize_bool(s) -> bool:\\n from distutils.util import strtobool\\n return strtobool(s) == 1\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Process\\', description=\\'Here we do all the preprocessing\\')\\n_parser.add_argument(\"--data-raw\", dest=\"data_raw_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--val-pct\", dest=\"val_pct\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--trainset-flag\", dest=\"trainset_flag\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--histogram-norm\", dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--data-processed\", dest=\"data_processed_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = process(**_parsed_args)\\n'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1',\n", + " 'resources': {'limits': {'cpu': '1', 'memory': '2Gi'}}},\n", + " 'inputs': {'artifacts': [{'name': 'parse-mnist-Dataset',\n", + " 'path': '/tmp/inputs/data_raw/data'}],\n", + " 'parameters': [{'name': 'histogram_norm'}]},\n", + " 'metadata': 
{'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"histogram_norm\": \"{{inputs.parameters.histogram_norm}}\", \"trainset_flag\": \"True\", \"val_pct\": \"0.2\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"Here we do all the preprocessing\", \"implementation\": {\"container\": {\"args\": [\"--data-raw\", {\"inputPath\": \"data_raw\"}, {\"if\": {\"cond\": {\"isPresent\": \"val_pct\"}, \"then\": [\"--val-pct\", {\"inputValue\": \"val_pct\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"trainset_flag\"}, \"then\": [\"--trainset-flag\", {\"inputValue\": \"trainset_flag\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"histogram_norm\"}, \"then\": [\"--histogram-norm\", {\"inputValue\": \"histogram_norm\"}]}}, \"--data-processed\", {\"outputPath\": \"data_processed\"}], \"command\": [\"sh\", \"-c\", \"(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scikit-learn\\' \\'tensorflow-addons[tensorflow]\\' --user) && \\\\\"$0\\\\\" \\\\\"$@\\\\\"\", \"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef process(\\\\n data_raw_path, # type: ignore\\\\n data_processed_path, # type: ignore\\\\n val_pct = 0.2,\\\\n trainset_flag = True,\\\\n histogram_norm = False,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n Here we do all the preprocessing\\\\n if the data path is for training data we:\\\\n (1) Normalize the data\\\\n (2) split the train and val data\\\\n If it is for unseen test data, we:\\\\n (1) Normalize the data\\\\n This 
function returns in any case the processed data path\\\\n \\\\\"\\\\\"\\\\\"\\\\n # sklearn\\\\n import pickle\\\\n from sklearn.model_selection import train_test_split\\\\n import tensorflow as tf\\\\n import tensorflow_addons as tfa\\\\n\\\\n def img_norm(x):\\\\n x_ = tf.reshape(x, list(x.shape) + [1])\\\\n\\\\n if histogram_norm:\\\\n x_ = tfa.image.equalize(x_)\\\\n\\\\n # Scale between 0-1\\\\n x_ = x_ / 255\\\\n return x_\\\\n\\\\n with open(data_raw_path, \\\\\"rb\\\\\") as f:\\\\n x, y = pickle.load(f)\\\\n if trainset_flag:\\\\n\\\\n x_ = img_norm(x)\\\\n x_train, x_val, y_train, y_val = train_test_split(\\\\n x_.numpy(), y, test_size=val_pct, stratify=y, random_state=42\\\\n )\\\\n\\\\n with open(data_processed_path, \\\\\"wb\\\\\") as output_file:\\\\n pickle.dump((x_train, y_train, x_val, y_val), output_file)\\\\n\\\\n else:\\\\n x_ = img_norm(x)\\\\n with open(data_processed_path, \\\\\"wb\\\\\") as output_file:\\\\n pickle.dump((x_, y), output_file)\\\\n\\\\ndef _deserialize_bool(s) -> bool:\\\\n from distutils.util import strtobool\\\\n return strtobool(s) == 1\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Process\\', description=\\'Here we do all the preprocessing\\')\\\\n_parser.add_argument(\\\\\"--data-raw\\\\\", dest=\\\\\"data_raw_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--val-pct\\\\\", dest=\\\\\"val_pct\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--trainset-flag\\\\\", dest=\\\\\"trainset_flag\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--histogram-norm\\\\\", dest=\\\\\"histogram_norm\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--data-processed\\\\\", dest=\\\\\"data_processed_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = 
vars(_parser.parse_args())\\\\n\\\\n_outputs = process(**_parsed_args)\\\\n\"], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"name\": \"data_raw\", \"type\": \"String\"}, {\"default\": \"0.2\", \"name\": \"val_pct\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"True\", \"name\": \"trainset_flag\", \"optional\": true, \"type\": \"Boolean\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}], \"name\": \"Process\", \"outputs\": [{\"name\": \"data_processed\", \"type\": \"String\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Preprocess images'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'process',\n", + " 'outputs': {'artifacts': [{'name': 'process-data_processed',\n", + " 'path': '/tmp/outputs/data_processed/data'}]}},\n", + " {'container': {'args': ['--data-train',\n", + " '/tmp/inputs/data_train/data',\n", + " '--lr',\n", + " '{{inputs.parameters.lr}}',\n", + " '--optimizer',\n", + " '{{inputs.parameters.optimizer}}',\n", + " '--loss',\n", + " '{{inputs.parameters.loss}}',\n", + " '--epochs',\n", + " '{{inputs.parameters.epochs}}',\n", + " '--batch-size',\n", + " '{{inputs.parameters.batch_size}}',\n", + " '--model-out',\n", + " '/tmp/outputs/model_out/data',\n", + " '--mlpipeline-metrics',\n", + " '/tmp/outputs/mlpipeline_metrics/data'],\n", + " 'command': ['sh',\n", + " '-c',\n", + " '(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' --user) && \"$0\" \"$@\"',\n", + " 'sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" 
\"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef train(\\n data_train_path, # type: ignore\\n model_out_path, # type: ignore\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\n lr = 1e-4,\\n optimizer = \"Adam\",\\n loss = \"categorical_crossentropy\",\\n epochs = 1,\\n batch_size = 32,\\n):\\n \"\"\"\\n This is the simulated train part of our ML pipeline where training is performed\\n \"\"\"\\n\\n import tensorflow as tf\\n import pickle\\n from tensorflow.keras.preprocessing.image import ImageDataGenerator\\n import json\\n\\n with open(data_train_path, \"rb\") as f:\\n x_train, y_train, x_val, y_val = pickle.load(f)\\n\\n model = tf.keras.Sequential(\\n [\\n tf.keras.layers.Conv2D(\\n 64, (3, 3), activation=\"relu\", input_shape=(28, 28, 1)\\n ),\\n tf.keras.layers.MaxPooling2D(2, 2),\\n tf.keras.layers.Conv2D(64, (3, 3), activation=\"relu\"),\\n tf.keras.layers.MaxPooling2D(2, 2),\\n tf.keras.layers.Flatten(),\\n tf.keras.layers.Dense(128, activation=\"relu\"),\\n tf.keras.layers.Dense(10, activation=\"softmax\"),\\n ]\\n )\\n\\n if optimizer.lower() == \"sgd\":\\n optimizer = tf.keras.optimizers.SGD(lr)\\n else:\\n optimizer = tf.keras.optimizers.Adam(lr)\\n\\n model.compile(loss=loss, optimizer=optimizer, metrics=[\"accuracy\"])\\n\\n # fit the model\\n model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\\n monitor=\"val_accuracy\", patience=10, verbose=1, restore_best_weights=True\\n )\\n\\n train_datagen = ImageDataGenerator()\\n\\n validation_datagen = ImageDataGenerator()\\n history = model.fit(\\n train_datagen.flow(x_train, y_train, batch_size=batch_size),\\n epochs=epochs,\\n validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\\n shuffle=False,\\n callbacks=[model_early_stopping_callback],\\n )\\n\\n model.save(model_out_path, save_format=\"tf\")\\n\\n metrics = {\\n \"metrics\": 
[\\n {\\n \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": history.history[\"accuracy\"][\\n -1\\n ], # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n {\\n \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": history.history[\"val_accuracy\"][\\n -1\\n ], # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n ]\\n }\\n with open(mlpipeline_metrics_path, \"w\") as f:\\n json.dump(metrics, f)\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Train\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\n_parser.add_argument(\"--data-train\", dest=\"data_train_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--optimizer\", dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--epochs\", dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--model-out\", dest=\"model_out_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\", 
type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = train(**_parsed_args)\\n'],\n", + " 'image': 'tensorflow/tensorflow:2.7.1',\n", + " 'resources': {'limits': {'cpu': '1', 'memory': '2Gi'}}},\n", + " 'inputs': {'artifacts': [{'name': 'process-data_processed',\n", + " 'path': '/tmp/inputs/data_train/data'}],\n", + " 'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"batch_size\": \"{{inputs.parameters.batch_size}}\", \"epochs\": \"{{inputs.parameters.epochs}}\", \"loss\": \"{{inputs.parameters.loss}}\", \"lr\": \"{{inputs.parameters.lr}}\", \"optimizer\": \"{{inputs.parameters.optimizer}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"This is the simulated train part of our ML pipeline where training is performed\", \"implementation\": {\"container\": {\"args\": [\"--data-train\", {\"inputPath\": \"data_train\"}, {\"if\": {\"cond\": {\"isPresent\": \"lr\"}, \"then\": [\"--lr\", {\"inputValue\": \"lr\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"optimizer\"}, \"then\": [\"--optimizer\", {\"inputValue\": \"optimizer\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"loss\"}, \"then\": [\"--loss\", {\"inputValue\": \"loss\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"epochs\"}, \"then\": [\"--epochs\", {\"inputValue\": \"epochs\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"batch_size\"}, \"then\": [\"--batch-size\", {\"inputValue\": \"batch_size\"}]}}, \"--model-out\", {\"outputPath\": \"model_out\"}, \"--mlpipeline-metrics\", {\"outputPath\": \"mlpipeline_metrics\"}], \"command\": [\"sh\", \"-c\", \"(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' || 
PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location \\'scipy\\' --user) && \\\\\"$0\\\\\" \\\\\"$@\\\\\"\", \"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef train(\\\\n data_train_path, # type: ignore\\\\n model_out_path, # type: ignore\\\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\\\n lr = 1e-4,\\\\n optimizer = \\\\\"Adam\\\\\",\\\\n loss = \\\\\"categorical_crossentropy\\\\\",\\\\n epochs = 1,\\\\n batch_size = 32,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n This is the simulated train part of our ML pipeline where training is performed\\\\n \\\\\"\\\\\"\\\\\"\\\\n\\\\n import tensorflow as tf\\\\n import pickle\\\\n from tensorflow.keras.preprocessing.image import ImageDataGenerator\\\\n import json\\\\n\\\\n with open(data_train_path, \\\\\"rb\\\\\") as f:\\\\n x_train, y_train, x_val, y_val = pickle.load(f)\\\\n\\\\n model = tf.keras.Sequential(\\\\n [\\\\n tf.keras.layers.Conv2D(\\\\n 64, (3, 3), activation=\\\\\"relu\\\\\", input_shape=(28, 28, 1)\\\\n ),\\\\n tf.keras.layers.MaxPooling2D(2, 2),\\\\n tf.keras.layers.Conv2D(64, (3, 3), activation=\\\\\"relu\\\\\"),\\\\n tf.keras.layers.MaxPooling2D(2, 2),\\\\n tf.keras.layers.Flatten(),\\\\n tf.keras.layers.Dense(128, activation=\\\\\"relu\\\\\"),\\\\n tf.keras.layers.Dense(10, activation=\\\\\"softmax\\\\\"),\\\\n ]\\\\n )\\\\n\\\\n if optimizer.lower() == \\\\\"sgd\\\\\":\\\\n optimizer = tf.keras.optimizers.SGD(lr)\\\\n else:\\\\n optimizer = tf.keras.optimizers.Adam(lr)\\\\n\\\\n model.compile(loss=loss, optimizer=optimizer, metrics=[\\\\\"accuracy\\\\\"])\\\\n\\\\n # fit the model\\\\n model_early_stopping_callback = tf.keras.callbacks.EarlyStopping(\\\\n 
monitor=\\\\\"val_accuracy\\\\\", patience=10, verbose=1, restore_best_weights=True\\\\n )\\\\n\\\\n train_datagen = ImageDataGenerator()\\\\n\\\\n validation_datagen = ImageDataGenerator()\\\\n history = model.fit(\\\\n train_datagen.flow(x_train, y_train, batch_size=batch_size),\\\\n epochs=epochs,\\\\n validation_data=validation_datagen.flow(x_val, y_val, batch_size=batch_size),\\\\n shuffle=False,\\\\n callbacks=[model_early_stopping_callback],\\\\n )\\\\n\\\\n model.save(model_out_path, save_format=\\\\\"tf\\\\\")\\\\n\\\\n metrics = {\\\\n \\\\\"metrics\\\\\": [\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": history.history[\\\\\"accuracy\\\\\"][\\\\n -1\\\\n ], # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"val-accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": history.history[\\\\\"val_accuracy\\\\\"][\\\\n -1\\\\n ], # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. 
Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n ]\\\\n }\\\\n with open(mlpipeline_metrics_path, \\\\\"w\\\\\") as f:\\\\n json.dump(metrics, f)\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Train\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\\\n_parser.add_argument(\\\\\"--data-train\\\\\", dest=\\\\\"data_train_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--lr\\\\\", dest=\\\\\"lr\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--optimizer\\\\\", dest=\\\\\"optimizer\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--loss\\\\\", dest=\\\\\"loss\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--epochs\\\\\", dest=\\\\\"epochs\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--batch-size\\\\\", dest=\\\\\"batch_size\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--model-out\\\\\", dest=\\\\\"model_out_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--mlpipeline-metrics\\\\\", dest=\\\\\"mlpipeline_metrics_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = train(**_parsed_args)\\\\n\"], \"image\": \"tensorflow/tensorflow:2.7.1\"}}, \"inputs\": [{\"name\": \"data_train\", \"type\": \"String\"}, {\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, 
\"type\": \"String\"}, {\"default\": \"1\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"32\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}], \"name\": \"Train\", \"outputs\": [{\"name\": \"model_out\", \"type\": \"String\"}, {\"name\": \"mlpipeline_metrics\", \"type\": \"Metrics\"}]}',\n", + " 'pipelines.kubeflow.org/max_cache_staleness': 'P0D',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Fit the model'},\n", + " 'labels': {'katib.kubeflow.org/model-training': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'train',\n", + " 'outputs': {'artifacts': [{'name': 'mlpipeline-metrics',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'},\n", + " {'name': 'train-model_out',\n", + " 'path': '/tmp/outputs/model_out/data'}]}}]}}}}}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "katib_client.create_experiment(katib_experiment)" ] @@ -1039,13 +1476,478 @@ "\n", "`kubectl get Workflow -n `" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Minimal example pipeline for e2e testing\n", + "\n", + "The following part generates a minimal Katib Experiment for e2e testing" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "def prep_e2e(\n", + " output_nr_path: OutputPath(int), # type: ignore # noqa: F821\n", + " histogram_norm: bool = True,\n", + "):\n", + " with open(output_nr_path, 'w') as writer:\n", + " writer.write(str(int(histogram_norm)))\n", + " \n", + "prep_e2e_op = create_component_from_func(\n", + " func=prep_e2e\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "def train_e2e(\n", + " input_nr_path: InputPath(int), # type: ignore 
# noqa: F821\n", + " mlpipeline_metrics_path: OutputPath(\"Metrics\"), # type: ignore # noqa: F821\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 1,\n", + " batch_size: int = 32,\n", + "):\n", + " \"\"\"\n", + " This is the simulated train part of our ML pipeline where training is performed\n", + " \"\"\"\n", + " import json \n", + " import time\n", + " with open(input_nr_path, 'r') as reader:\n", + " line = reader.readline()\n", + " histogram_norm_value = int(line)\n", + "\n", + " accuracy = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\n", + " val_accuracy = accuracy * 0.9\n", + " metrics = {\n", + " \"metrics\": [\n", + " {\n", + " \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": accuracy, # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " {\n", + " \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\n", + " \"numberValue\": val_accuracy, # The value of the metric. Must be a numeric value.\n", + " \"format\": \"PERCENTAGE\", # The optional format of the metric. 
Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\n", + " },\n", + " ]\n", + " }\n", + " with open(mlpipeline_metrics_path, \"w\") as f:\n", + " json.dump(metrics, f)\n", + " \n", + " # If this step is too fast, the metrics collector fails as the\n", + " # pod is already finished before it can collect the metrics.\n", + " time.sleep(10)\n", + "\n", + "\n", + "train_e2e_op = create_component_from_func(\n", + " func=train_e2e\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Minimal KFP1 pipeline for e2e testing\",\n", + " description=\"\",\n", + ")\n", + "def e2e_example_pipeline(\n", + " lr: float = 1e-4,\n", + " optimizer: str = \"Adam\",\n", + " loss: str = \"categorical_crossentropy\",\n", + " epochs: int = 3,\n", + " batch_size: int = 5,\n", + " histogram_norm: bool = False,\n", + "):\n", + " prep_e2e_output = (\n", + " prep_e2e_op(\n", + " histogram_norm=histogram_norm,\n", + " )\n", + " .set_display_name(\"Prepare a dummy output that should be cached\")\n", + " )\n", + " _label_cache(prep_e2e_output)\n", + "\n", + " training_output = (\n", + " train_e2e_op(\n", + " prep_e2e_output.output,\n", + " lr=lr,\n", + " optimizer=optimizer,\n", + " epochs=epochs,\n", + " batch_size=batch_size,\n", + " loss=loss,\n", + " )\n", + " )\n", + " training_output.set_display_name(\"Generate dummy metrics\")\n", + " # This pod label indicates which pod Katib should collect the metric from.\n", + " # A metrics collecting sidecar container will be added\n", + " training_output.add_pod_label(\"katib.kubeflow.org/model-training\", \"true\")\n", + " # This step needs to run always, as otherwise the metrics for Katib could not\n", + " # be collected.\n", + " training_output.execution_options.caching_strategy.max_cache_staleness = \"P0D\"" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, +
"outputs": [ + { + "data": { + "text/html": [ + "Experiment details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run details." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kfp_run = f\"e2e-example-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + "run = kfp_client.create_run_from_pipeline_func(\n", + " e2e_example_pipeline,\n", + " mode=kfp.dsl.PipelineExecutionMode.V1_LEGACY,\n", + " # You can optionally override your pipeline_root when submitting the run too:\n", + " # pipeline_root='gs://my-pipeline-root/example-pipeline',\n", + " arguments={\"histogram_norm\": \"0\"},\n", + " experiment_name=KFP_EXPERIMENT,\n", + " run_name=kfp_run,\n", + " # In a multiuser setup, provide the namespace\n", + " #namespace=USER_NAMESPACE,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the full spec\n", + "\n", + "katib_e2e_spec = create_katib_experiment_spec(\n", + " pipeline=e2e_example_pipeline,\n", + " pipeline_params=pipeline_params,\n", + " trial_params=trial_params_specs,\n", + " trial_params_space=parameter_space,\n", + " objective=objective,\n", + " algorithm=algorithm,\n", + " pipeline_service_account=KFP_SERVICE_ACCOUNT,\n", + " max_trial_count=5,\n", + " parallel_trial_count=5,\n", + " retain_pods=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the experiment\n", + "\n", + "katib_e2e_experiment_name = (\n", + " f\"katib-e2e-{dt.today().strftime('%Y-%m-%d-%Hh-%Mm-%Ss')}\"\n", + ")\n", + "katib_e2e_experiment = V1beta1Experiment(\n", + " api_version=\"kubeflow.org/v1beta1\",\n", + " kind=\"Experiment\",\n", + " metadata=V1ObjectMeta(\n", + " name=katib_e2e_experiment_name,\n", + " namespace=USER_NAMESPACE,\n", + " ),\n", + "
spec=katib_e2e_spec,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "with open(f\"{KATIB_E2E_EXPERIMENT}.yaml\", \"w\") as f:\n", + " yaml.dump(ApiClient().sanitize_for_serialization(katib_e2e_experiment), f)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'creationTimestamp': '2023-07-20T20:37:59Z',\n", + " 'generation': 1,\n", + " 'managedFields': [{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'fieldsType': 'FieldsV1',\n", + " 'fieldsV1': {'f:spec': {'.': {},\n", + " 'f:algorithm': {'.': {}, 'f:algorithmName': {}},\n", + " 'f:maxFailedTrialCount': {},\n", + " 'f:maxTrialCount': {},\n", + " 'f:metricsCollectorSpec': {'.': {},\n", + " 'f:collector': {'.': {},\n", + " 'f:customCollector': {'.': {},\n", + " 'f:args': {},\n", + " 'f:env': {},\n", + " 'f:image': {},\n", + " 'f:imagePullPolicy': {},\n", + " 'f:name': {}},\n", + " 'f:kind': {}},\n", + " 'f:source': {'.': {},\n", + " 'f:fileSystemPath': {'.': {}, 'f:kind': {}, 'f:path': {}}}},\n", + " 'f:objective': {'.': {},\n", + " 'f:additionalMetricNames': {},\n", + " 'f:goal': {},\n", + " 'f:objectiveMetricName': {},\n", + " 'f:type': {}},\n", + " 'f:parallelTrialCount': {},\n", + " 'f:parameters': {},\n", + " 'f:trialTemplate': {'.': {},\n", + " 'f:failureCondition': {},\n", + " 'f:primaryContainerName': {},\n", + " 'f:primaryPodLabels': {'.': {},\n", + " 'f:katib.kubeflow.org/model-training': {}},\n", + " 'f:retain': {},\n", + " 'f:successCondition': {},\n", + " 'f:trialParameters': {},\n", + " 'f:trialSpec': {'.': {},\n", + " 'f:apiVersion': {},\n", + " 'f:kind': {},\n", + " 'f:metadata': {'.': {},\n", + " 'f:annotations': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {},\n", + " 'f:pipelines.kubeflow.org/pipeline_compilation_time': {},\n", + " 
'f:pipelines.kubeflow.org/pipeline_spec': {}},\n", + " 'f:generateName': {},\n", + " 'f:labels': {'.': {},\n", + " 'f:pipelines.kubeflow.org/kfp_sdk_version': {}}},\n", + " 'f:spec': {'.': {},\n", + " 'f:arguments': {'.': {}, 'f:parameters': {}},\n", + " 'f:entrypoint': {},\n", + " 'f:serviceAccountName': {},\n", + " 'f:templates': {}}}}}},\n", + " 'manager': 'OpenAPI-Generator',\n", + " 'operation': 'Update',\n", + " 'time': '2023-07-20T20:37:59Z'}],\n", + " 'name': 'katib-e2e-2023-07-20-22h-37m-57s',\n", + " 'namespace': 'kubeflow',\n", + " 'resourceVersion': '11759',\n", + " 'uid': 'c91aa6c9-8a2b-434d-9ab8-c4a317210893'},\n", + " 'spec': {'algorithm': {'algorithmName': 'random'},\n", + " 'maxFailedTrialCount': 2,\n", + " 'maxTrialCount': 5,\n", + " 'metricsCollectorSpec': {'collector': {'customCollector': {'args': ['-m',\n", + " 'val-accuracy;accuracy',\n", + " '-s',\n", + " 'katib-db-manager.kubeflow:6789',\n", + " '-t',\n", + " '$(PodName)',\n", + " '-path',\n", + " '/tmp/outputs/mlpipeline_metrics'],\n", + " 'env': [{'name': 'PodName',\n", + " 'valueFrom': {'fieldRef': {'fieldPath': 'metadata.name'}}}],\n", + " 'image': 'docker.io/votti/kfpv1-metricscollector:v0.0.10',\n", + " 'imagePullPolicy': 'Always',\n", + " 'name': 'custom-metrics-logger-and-collector',\n", + " 'resources': {}},\n", + " 'kind': 'Custom'},\n", + " 'source': {'fileSystemPath': {'kind': 'File',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}}},\n", + " 'objective': {'additionalMetricNames': ['accuracy'],\n", + " 'goal': 0.9,\n", + " 'metricStrategies': [{'name': 'val-accuracy', 'value': 'max'},\n", + " {'name': 'accuracy', 'value': 'max'}],\n", + " 'objectiveMetricName': 'val-accuracy',\n", + " 'type': 'maximize'},\n", + " 'parallelTrialCount': 5,\n", + " 'parameters': [{'feasibleSpace': {'max': '0.001', 'min': '0.00001'},\n", + " 'name': 'learning_rate',\n", + " 'parameterType': 'double'},\n", + " {'feasibleSpace': {'max': '64', 'min': '16'},\n", + " 'name': 'batch_size',\n", + " 
'parameterType': 'int'},\n", + " {'feasibleSpace': {'list': ['0', '1']},\n", + " 'name': 'histogram_norm',\n", + " 'parameterType': 'discrete'}],\n", + " 'resumePolicy': 'Never',\n", + " 'trialTemplate': {'failureCondition': 'status.[@this].#(phase==\"Failed\")#',\n", + " 'primaryContainerName': 'main',\n", + " 'primaryPodLabels': {'katib.kubeflow.org/model-training': 'true'},\n", + " 'successCondition': 'status.[@this].#(phase==\"Succeeded\")#',\n", + " 'trialParameters': [{'description': 'Learning rate for the training model',\n", + " 'name': 'learningRate',\n", + " 'reference': 'learning_rate'},\n", + " {'description': 'Batch size for NN training',\n", + " 'name': 'batchSize',\n", + " 'reference': 'batch_size'},\n", + " {'description': 'Histogram normalization of image on?',\n", + " 'name': 'histogramNorm',\n", + " 'reference': 'histogram_norm'}],\n", + " 'trialSpec': {'apiVersion': 'argoproj.io/v1alpha1',\n", + " 'kind': 'Workflow',\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline_compilation_time': '2023-07-20T22:37:57.355215',\n", + " 'pipelines.kubeflow.org/pipeline_spec': '{\"inputs\": [{\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"3\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"5\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"False\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}, {\"default\": \"${trialParameters.learningRate}\", \"name\": \"lr\"}, {\"default\": \"${trialParameters.batchSize}\", \"name\": \"batch_size\"}, {\"default\": \"${trialParameters.histogramNorm}\", \"name\": \"histogram_norm\"}], \"name\": 
\"Minimal KFP1 pipeline for e2e testing\"}'},\n", + " 'generateName': 'minimal-kfp1-pipeline-for-e2e-testing-',\n", + " 'labels': {'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12'}},\n", + " 'spec': {'arguments': {'parameters': [{'name': 'lr',\n", + " 'value': '${trialParameters.learningRate}'},\n", + " {'name': 'optimizer', 'value': 'Adam'},\n", + " {'name': 'loss', 'value': 'categorical_crossentropy'},\n", + " {'name': 'epochs', 'value': '3'},\n", + " {'name': 'batch_size', 'value': '${trialParameters.batchSize}'},\n", + " {'name': 'histogram_norm',\n", + " 'value': '${trialParameters.histogramNorm}'}]},\n", + " 'entrypoint': 'minimal-kfp1-pipeline-for-e2e-testing',\n", + " 'serviceAccountName': 'pipeline-runner',\n", + " 'templates': [{'dag': {'tasks': [{'arguments': {'parameters': [{'name': 'histogram_norm',\n", + " 'value': '{{inputs.parameters.histogram_norm}}'}]},\n", + " 'name': 'prep-e2e',\n", + " 'template': 'prep-e2e'},\n", + " {'arguments': {'artifacts': [{'from': '{{tasks.prep-e2e.outputs.artifacts.prep-e2e-output_nr}}',\n", + " 'name': 'prep-e2e-output_nr'}],\n", + " 'parameters': [{'name': 'batch_size',\n", + " 'value': '{{inputs.parameters.batch_size}}'},\n", + " {'name': 'epochs', 'value': '{{inputs.parameters.epochs}}'},\n", + " {'name': 'loss', 'value': '{{inputs.parameters.loss}}'},\n", + " {'name': 'lr', 'value': '{{inputs.parameters.lr}}'},\n", + " {'name': 'optimizer',\n", + " 'value': '{{inputs.parameters.optimizer}}'}]},\n", + " 'dependencies': ['prep-e2e'],\n", + " 'name': 'train-e2e',\n", + " 'template': 'train-e2e'}]},\n", + " 'inputs': {'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'histogram_norm'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'name': 'minimal-kfp1-pipeline-for-e2e-testing'},\n", + " {'container': {'args': ['--histogram-norm',\n", + " '{{inputs.parameters.histogram_norm}}',\n", + " '--output-nr',\n", + " 
'/tmp/outputs/output_nr/data'],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef prep_e2e(\\n output_nr_path, # type: ignore # noqa: F821\\n histogram_norm = True,\\n):\\n with open(output_nr_path, \\'w\\') as writer:\\n writer.write(str(int(histogram_norm)))\\n\\ndef _deserialize_bool(s) -> bool:\\n from distutils.util import strtobool\\n return strtobool(s) == 1\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Prep e2e\\', description=\\'\\')\\n_parser.add_argument(\"--histogram-norm\", dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--output-nr\", dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = prep_e2e(**_parsed_args)\\n'],\n", + " 'image': 'python:3.7'},\n", + " 'inputs': {'parameters': [{'name': 'histogram_norm'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"histogram_norm\": \"{{inputs.parameters.histogram_norm}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"implementation\": {\"container\": {\"args\": [{\"if\": {\"cond\": {\"isPresent\": \"histogram_norm\"}, \"then\": [\"--histogram-norm\", {\"inputValue\": \"histogram_norm\"}]}}, \"--output-nr\", {\"outputPath\": \"output_nr\"}], \"command\": [\"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n 
os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef prep_e2e(\\\\n output_nr_path, # type: ignore # noqa: F821\\\\n histogram_norm = True,\\\\n):\\\\n with open(output_nr_path, \\'w\\') as writer:\\\\n writer.write(str(int(histogram_norm)))\\\\n\\\\ndef _deserialize_bool(s) -> bool:\\\\n from distutils.util import strtobool\\\\n return strtobool(s) == 1\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Prep e2e\\', description=\\'\\')\\\\n_parser.add_argument(\\\\\"--histogram-norm\\\\\", dest=\\\\\"histogram_norm\\\\\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--output-nr\\\\\", dest=\\\\\"output_nr_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = prep_e2e(**_parsed_args)\\\\n\"], \"image\": \"python:3.7\"}}, \"inputs\": [{\"default\": \"True\", \"name\": \"histogram_norm\", \"optional\": true, \"type\": \"Boolean\"}], \"name\": \"Prep e2e\", \"outputs\": [{\"name\": \"output_nr\", \"type\": \"Integer\"}]}',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Prepare a dummy output that should be cached'},\n", + " 'labels': {'pipelines.kubeflow.org/cache_enabled': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'prep-e2e',\n", + " 'outputs': {'artifacts': [{'name': 'prep-e2e-output_nr',\n", + " 'path': '/tmp/outputs/output_nr/data'}]}},\n", + " {'container': {'args': ['--input-nr',\n", + " '/tmp/inputs/input_nr/data',\n", + " '--lr',\n", + " '{{inputs.parameters.lr}}',\n", + " '--optimizer',\n", + " '{{inputs.parameters.optimizer}}',\n", + " '--loss',\n", + " '{{inputs.parameters.loss}}',\n", + " '--epochs',\n", + " '{{inputs.parameters.epochs}}',\n", + " '--batch-size',\n", + " 
'{{inputs.parameters.batch_size}}',\n", + " '--mlpipeline-metrics',\n", + " '/tmp/outputs/mlpipeline_metrics/data'],\n", + " 'command': ['sh',\n", + " '-ec',\n", + " 'program_path=$(mktemp)\\nprintf \"%s\" \"$0\" > \"$program_path\"\\npython3 -u \"$program_path\" \"$@\"\\n',\n", + " 'def _make_parent_dirs_and_return_path(file_path: str):\\n import os\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\n return file_path\\n\\ndef train_e2e(\\n input_nr_path, # type: ignore # noqa: F821\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\n lr = 1e-4,\\n optimizer = \"Adam\",\\n loss = \"categorical_crossentropy\",\\n epochs = 1,\\n batch_size = 32,\\n):\\n \"\"\"\\n This is the simulated train part of our ML pipeline where training is performed\\n \"\"\"\\n import json \\n import time\\n with open(input_nr_path, \\'r\\') as reader:\\n line = reader.readline()\\n histogram_norm_value = int(line)\\n\\n accuracy = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\\n val_accuracy = accuracy * 0.9\\n metrics = {\\n \"metrics\": [\\n {\\n \"name\": \"accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": accuracy, # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n {\\n \"name\": \"val-accuracy\", # The name of the metric. Visualized as the column name in the runs table.\\n \"numberValue\": val_accuracy, # The value of the metric. Must be a numeric value.\\n \"format\": \"PERCENTAGE\", # The optional format of the metric. 
Supported values are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed in percentage format).\\n },\\n ]\\n }\\n with open(mlpipeline_metrics_path, \"w\") as f:\\n json.dump(metrics, f)\\n\\n # If this step is to fast, the metrics collector fails as the\\n # pod is already finished before it can collect the metrics.\\n time.sleep(10)\\n\\nimport argparse\\n_parser = argparse.ArgumentParser(prog=\\'Train e2e\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\n_parser.add_argument(\"--input-nr\", dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--optimizer\", dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--epochs\", dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\\n_parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\n_parsed_args = vars(_parser.parse_args())\\n\\n_outputs = train_e2e(**_parsed_args)\\n'],\n", + " 'image': 'python:3.7'},\n", + " 'inputs': {'artifacts': [{'name': 'prep-e2e-output_nr',\n", + " 'path': '/tmp/inputs/input_nr/data'}],\n", + " 'parameters': [{'name': 'batch_size'},\n", + " {'name': 'epochs'},\n", + " {'name': 'loss'},\n", + " {'name': 'lr'},\n", + " {'name': 'optimizer'}]},\n", + " 'metadata': {'annotations': {'pipelines.kubeflow.org/arguments.parameters': '{\"batch_size\": \"{{inputs.parameters.batch_size}}\", \"epochs\": \"{{inputs.parameters.epochs}}\", \"loss\": \"{{inputs.parameters.loss}}\", \"lr\": 
\"{{inputs.parameters.lr}}\", \"optimizer\": \"{{inputs.parameters.optimizer}}\"}',\n", + " 'pipelines.kubeflow.org/component_ref': '{}',\n", + " 'pipelines.kubeflow.org/component_spec': '{\"description\": \"This is the simulated train part of our ML pipeline where training is performed\", \"implementation\": {\"container\": {\"args\": [\"--input-nr\", {\"inputPath\": \"input_nr\"}, {\"if\": {\"cond\": {\"isPresent\": \"lr\"}, \"then\": [\"--lr\", {\"inputValue\": \"lr\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"optimizer\"}, \"then\": [\"--optimizer\", {\"inputValue\": \"optimizer\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"loss\"}, \"then\": [\"--loss\", {\"inputValue\": \"loss\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"epochs\"}, \"then\": [\"--epochs\", {\"inputValue\": \"epochs\"}]}}, {\"if\": {\"cond\": {\"isPresent\": \"batch_size\"}, \"then\": [\"--batch-size\", {\"inputValue\": \"batch_size\"}]}}, \"--mlpipeline-metrics\", {\"outputPath\": \"mlpipeline_metrics\"}], \"command\": [\"sh\", \"-ec\", \"program_path=$(mktemp)\\\\nprintf \\\\\"%s\\\\\" \\\\\"$0\\\\\" > \\\\\"$program_path\\\\\"\\\\npython3 -u \\\\\"$program_path\\\\\" \\\\\"$@\\\\\"\\\\n\", \"def _make_parent_dirs_and_return_path(file_path: str):\\\\n import os\\\\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\\\\n return file_path\\\\n\\\\ndef train_e2e(\\\\n input_nr_path, # type: ignore # noqa: F821\\\\n mlpipeline_metrics_path, # type: ignore # noqa: F821\\\\n lr = 1e-4,\\\\n optimizer = \\\\\"Adam\\\\\",\\\\n loss = \\\\\"categorical_crossentropy\\\\\",\\\\n epochs = 1,\\\\n batch_size = 32,\\\\n):\\\\n \\\\\"\\\\\"\\\\\"\\\\n This is the simulated train part of our ML pipeline where training is performed\\\\n \\\\\"\\\\\"\\\\\"\\\\n import json \\\\n import time\\\\n with open(input_nr_path, \\'r\\') as reader:\\\\n line = reader.readline()\\\\n histogram_norm_value = int(line)\\\\n\\\\n accuracy = (batch_size + histogram_norm_value)/ (batch_size + 
epochs+histogram_norm_value)\\\\n val_accuracy = accuracy * 0.9\\\\n metrics = {\\\\n \\\\\"metrics\\\\\": [\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": accuracy, # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n {\\\\n \\\\\"name\\\\\": \\\\\"val-accuracy\\\\\", # The name of the metric. Visualized as the column name in the runs table.\\\\n \\\\\"numberValue\\\\\": val_accuracy, # The value of the metric. Must be a numeric value.\\\\n \\\\\"format\\\\\": \\\\\"PERCENTAGE\\\\\", # The optional format of the metric. Supported values are \\\\\"RAW\\\\\" (displayed in raw format) and \\\\\"PERCENTAGE\\\\\" (displayed in percentage format).\\\\n },\\\\n ]\\\\n }\\\\n with open(mlpipeline_metrics_path, \\\\\"w\\\\\") as f:\\\\n json.dump(metrics, f)\\\\n\\\\n # If this step is to fast, the metrics collector fails as the\\\\n # pod is already finished before it can collect the metrics.\\\\n time.sleep(10)\\\\n\\\\nimport argparse\\\\n_parser = argparse.ArgumentParser(prog=\\'Train e2e\\', description=\\'This is the simulated train part of our ML pipeline where training is performed\\')\\\\n_parser.add_argument(\\\\\"--input-nr\\\\\", dest=\\\\\"input_nr_path\\\\\", type=str, required=True, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--lr\\\\\", dest=\\\\\"lr\\\\\", type=float, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--optimizer\\\\\", dest=\\\\\"optimizer\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--loss\\\\\", dest=\\\\\"loss\\\\\", type=str, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--epochs\\\\\", 
dest=\\\\\"epochs\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--batch-size\\\\\", dest=\\\\\"batch_size\\\\\", type=int, required=False, default=argparse.SUPPRESS)\\\\n_parser.add_argument(\\\\\"--mlpipeline-metrics\\\\\", dest=\\\\\"mlpipeline_metrics_path\\\\\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\\\\n_parsed_args = vars(_parser.parse_args())\\\\n\\\\n_outputs = train_e2e(**_parsed_args)\\\\n\"], \"image\": \"python:3.7\"}}, \"inputs\": [{\"name\": \"input_nr\", \"type\": \"Integer\"}, {\"default\": \"0.0001\", \"name\": \"lr\", \"optional\": true, \"type\": \"Float\"}, {\"default\": \"Adam\", \"name\": \"optimizer\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"categorical_crossentropy\", \"name\": \"loss\", \"optional\": true, \"type\": \"String\"}, {\"default\": \"1\", \"name\": \"epochs\", \"optional\": true, \"type\": \"Integer\"}, {\"default\": \"32\", \"name\": \"batch_size\", \"optional\": true, \"type\": \"Integer\"}], \"name\": \"Train e2e\", \"outputs\": [{\"name\": \"mlpipeline_metrics\", \"type\": \"Metrics\"}]}',\n", + " 'pipelines.kubeflow.org/max_cache_staleness': 'P0D',\n", + " 'pipelines.kubeflow.org/task_display_name': 'Generate dummy metrics'},\n", + " 'labels': {'katib.kubeflow.org/model-training': 'true',\n", + " 'pipelines.kubeflow.org/enable_caching': 'true',\n", + " 'pipelines.kubeflow.org/kfp_sdk_version': '1.8.12',\n", + " 'pipelines.kubeflow.org/pipeline-sdk-type': 'kfp'}},\n", + " 'name': 'train-e2e',\n", + " 'outputs': {'artifacts': [{'name': 'mlpipeline-metrics',\n", + " 'path': '/tmp/outputs/mlpipeline_metrics/data'}]}}]}}}}}" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "katib_client.create_experiment(katib_e2e_experiment)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "katib-exp", + "display_name": "katibdev", "language": "python", - "name": "python3" + "name": "katibdev" }, "language_info": { "codemirror_mode": { @@ -1057,7 +1959,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.0" }, "vscode": { "interpreter": { From 0504085785f87637adf1742a6e8a81c321042f8b Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Thu, 20 Jul 2023 22:42:44 +0200 Subject: [PATCH 20/26] Revert "TMP: changes to run tests locally" This reverts commit 36ed3727701c257f327493e2e012c4f7df7bf51c. --- manifests/v1beta1/components/mysql/pvc.yaml | 3 ++- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/manifests/v1beta1/components/mysql/pvc.yaml b/manifests/v1beta1/components/mysql/pvc.yaml index 152f43bba9a..9249d8c6ea2 100644 --- a/manifests/v1beta1/components/mysql/pvc.yaml +++ b/manifests/v1beta1/components/mysql/pvc.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -8,4 +9,4 @@ spec: - ReadWriteOnce resources: requests: - storage: 2Gi + storage: 10Gi diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 83ef1888de2..8a3f41b7049 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -25,7 +25,7 @@ DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} DEPLOY_KFP=${4:-false} -E2E_TEST_IMAGE_TAG="v0.15.0" +E2E_TEST_IMAGE_TAG="e2e-test" TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" KFP_ENV=platform-agnostic-emissary @@ -51,12 +51,12 @@ fi # If the user wants to deploy Katib UI, then use the kustomization file for Katib UI. if ! "$DEPLOY_KATIB_UI"; then - index="$(yq -y '.resources.[] | select(. 
== "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)" - index="$index" yq -y -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE + index="$(yq eval '.resources.[] | select(. == "../../components/ui/") | path | .[-1]' $KUSTOMIZATION_FILE)" + index="$index" yq eval -i 'del(.resources.[env(index)])' $KUSTOMIZATION_FILE fi # Since e2e test doesn't need to large storage, we use a small PVC for Katib. -yq -y -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE +yq eval -i '.spec.resources.requests.storage|="2Gi"' $PVC_FILE echo -e "\n The Katib will be deployed with the following configs" cat $KUSTOMIZATION_FILE From 4cddd3e45816caffbe059c0d3fc4385ee7e37a69 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Thu, 20 Jul 2023 22:43:05 +0200 Subject: [PATCH 21/26] Adds spec of a simple kfp1+katib experiment spec This could be used for e2e testing --- .../katib-kfp-example-e2e-v1.yaml | 374 ++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml diff --git a/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml b/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml new file mode 100644 index 00000000000..18d6683825c --- /dev/null +++ b/examples/v1beta1/kubeflow-pipelines/katib-kfp-example-e2e-v1.yaml @@ -0,0 +1,374 @@ +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + name: katib-e2e-2023-07-20-22h-37m-57s + namespace: kubeflow +spec: + algorithm: + algorithmName: random + maxFailedTrialCount: 2 + maxTrialCount: 5 + metricsCollectorSpec: + collector: + customCollector: + args: + - -m + - val-accuracy;accuracy + - -s + - katib-db-manager.kubeflow:6789 + - -t + - $(PodName) + - -path + - /tmp/outputs/mlpipeline_metrics + env: + - name: PodName + valueFrom: + fieldRef: + fieldPath: metadata.name + image: docker.io/votti/kfpv1-metricscollector:v0.0.10 + imagePullPolicy: Always + name: custom-metrics-logger-and-collector + kind: Custom + source: 
+ fileSystemPath: + kind: File + path: /tmp/outputs/mlpipeline_metrics/data + objective: + additionalMetricNames: + - accuracy + goal: 0.9 + objectiveMetricName: val-accuracy + type: maximize + parallelTrialCount: 5 + parameters: + - feasibleSpace: + max: '0.001' + min: '0.00001' + name: learning_rate + parameterType: double + - feasibleSpace: + max: '64' + min: '16' + name: batch_size + parameterType: int + - feasibleSpace: + list: + - '0' + - '1' + name: histogram_norm + parameterType: discrete + trialTemplate: + failureCondition: status.[@this].#(phase=="Failed")# + primaryContainerName: main + primaryPodLabels: + katib.kubeflow.org/model-training: 'true' + retain: false + successCondition: status.[@this].#(phase=="Succeeded")# + trialParameters: + - description: Learning rate for the training model + name: learningRate + reference: learning_rate + - description: Batch size for NN training + name: batchSize + reference: batch_size + - description: Histogram normalization of image on? 
+ name: histogramNorm + reference: histogram_norm + trialSpec: + apiVersion: argoproj.io/v1alpha1 + kind: Workflow + metadata: + annotations: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline_compilation_time: '2023-07-20T22:37:57.355215' + pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"default": "0.0001", + "name": "lr", "optional": true, "type": "Float"}, {"default": "Adam", + "name": "optimizer", "optional": true, "type": "String"}, {"default": + "categorical_crossentropy", "name": "loss", "optional": true, "type": + "String"}, {"default": "3", "name": "epochs", "optional": true, "type": + "Integer"}, {"default": "5", "name": "batch_size", "optional": true, "type": + "Integer"}, {"default": "False", "name": "histogram_norm", "optional": + true, "type": "Boolean"}, {"default": "${trialParameters.learningRate}", + "name": "lr"}, {"default": "${trialParameters.batchSize}", "name": "batch_size"}, + {"default": "${trialParameters.histogramNorm}", "name": "histogram_norm"}], + "name": "Minimal KFP1 pipeline for e2e testing"}' + generateName: minimal-kfp1-pipeline-for-e2e-testing- + labels: + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + spec: + arguments: + parameters: + - name: lr + value: ${trialParameters.learningRate} + - name: optimizer + value: Adam + - name: loss + value: categorical_crossentropy + - name: epochs + value: '3' + - name: batch_size + value: ${trialParameters.batchSize} + - name: histogram_norm + value: ${trialParameters.histogramNorm} + entrypoint: minimal-kfp1-pipeline-for-e2e-testing + serviceAccountName: pipeline-runner + templates: + - dag: + tasks: + - arguments: + parameters: + - name: histogram_norm + value: '{{inputs.parameters.histogram_norm}}' + name: prep-e2e + template: prep-e2e + - arguments: + artifacts: + - from: '{{tasks.prep-e2e.outputs.artifacts.prep-e2e-output_nr}}' + name: prep-e2e-output_nr + parameters: + - name: batch_size + value: '{{inputs.parameters.batch_size}}' + - name: epochs 
+ value: '{{inputs.parameters.epochs}}' + - name: loss + value: '{{inputs.parameters.loss}}' + - name: lr + value: '{{inputs.parameters.lr}}' + - name: optimizer + value: '{{inputs.parameters.optimizer}}' + dependencies: + - prep-e2e + name: train-e2e + template: train-e2e + inputs: + parameters: + - name: batch_size + - name: epochs + - name: histogram_norm + - name: loss + - name: lr + - name: optimizer + name: minimal-kfp1-pipeline-for-e2e-testing + - container: + args: + - --histogram-norm + - '{{inputs.parameters.histogram_norm}}' + - --output-nr + - /tmp/outputs/output_nr/data + command: + - sh + - -ec + - 'program_path=$(mktemp) + + printf "%s" "$0" > "$program_path" + + python3 -u "$program_path" "$@" + + ' + - "def _make_parent_dirs_and_return_path(file_path: str):\n import\ + \ os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n \ + \ return file_path\n\ndef prep_e2e(\n output_nr_path, # type:\ + \ ignore # noqa: F821\n histogram_norm = True,\n):\n with open(output_nr_path,\ + \ 'w') as writer:\n writer.write(str(int(histogram_norm)))\n\n\ + def _deserialize_bool(s) -> bool:\n from distutils.util import strtobool\n\ + \ return strtobool(s) == 1\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Prep\ + \ e2e', description='')\n_parser.add_argument(\"--histogram-norm\",\ + \ dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--output-nr\", dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = prep_e2e(**_parsed_args)\n" + image: python:3.7 + inputs: + parameters: + - name: histogram_norm + metadata: + annotations: + pipelines.kubeflow.org/arguments.parameters: '{"histogram_norm": "{{inputs.parameters.histogram_norm}}"}' + pipelines.kubeflow.org/component_ref: '{}' + pipelines.kubeflow.org/component_spec: '{"implementation": {"container": + {"args": 
[{"if": {"cond": {"isPresent": "histogram_norm"}, "then": + ["--histogram-norm", {"inputValue": "histogram_norm"}]}}, "--output-nr", + {"outputPath": "output_nr"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf + \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def _make_parent_dirs_and_return_path(file_path: str):\n import + os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return + file_path\n\ndef prep_e2e(\n output_nr_path, # type: ignore # + noqa: F821\n histogram_norm = True,\n):\n with open(output_nr_path, + ''w'') as writer:\n writer.write(str(int(histogram_norm)))\n\ndef + _deserialize_bool(s) -> bool:\n from distutils.util import strtobool\n return + strtobool(s) == 1\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Prep + e2e'', description='''')\n_parser.add_argument(\"--histogram-norm\", + dest=\"histogram_norm\", type=_deserialize_bool, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-nr\", + dest=\"output_nr_path\", type=_make_parent_dirs_and_return_path, required=True, + default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs + = prep_e2e(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": + [{"default": "True", "name": "histogram_norm", "optional": true, "type": + "Boolean"}], "name": "Prep e2e", "outputs": [{"name": "output_nr", + "type": "Integer"}]}' + pipelines.kubeflow.org/task_display_name: Prepare a dummy output that + should be cached + labels: + pipelines.kubeflow.org/cache_enabled: 'true' + pipelines.kubeflow.org/enable_caching: 'true' + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + name: prep-e2e + outputs: + artifacts: + - name: prep-e2e-output_nr + path: /tmp/outputs/output_nr/data + - container: + args: + - --input-nr + - /tmp/inputs/input_nr/data + - --lr + - '{{inputs.parameters.lr}}' + - --optimizer + - '{{inputs.parameters.optimizer}}' + - --loss + - 
'{{inputs.parameters.loss}}' + - --epochs + - '{{inputs.parameters.epochs}}' + - --batch-size + - '{{inputs.parameters.batch_size}}' + - --mlpipeline-metrics + - /tmp/outputs/mlpipeline_metrics/data + command: + - sh + - -ec + - 'program_path=$(mktemp) + + printf "%s" "$0" > "$program_path" + + python3 -u "$program_path" "$@" + + ' + - "def _make_parent_dirs_and_return_path(file_path: str):\n import\ + \ os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n \ + \ return file_path\n\ndef train_e2e(\n input_nr_path, # type:\ + \ ignore # noqa: F821\n mlpipeline_metrics_path, # type: ignore\ + \ # noqa: F821\n lr = 1e-4,\n optimizer = \"Adam\",\n loss\ + \ = \"categorical_crossentropy\",\n epochs = 1,\n batch_size =\ + \ 32,\n):\n \"\"\"\n This is the simulated train part of our ML\ + \ pipeline where training is performed\n \"\"\"\n import json\ + \ \n import time\n with open(input_nr_path, 'r') as reader:\n\ + \ line = reader.readline()\n histogram_norm_value = int(line)\n\ + \n accuracy = (batch_size + histogram_norm_value)/ (batch_size +\ + \ epochs+histogram_norm_value)\n val_accuracy = accuracy * 0.9\n\ + \ metrics = {\n \"metrics\": [\n {\n \ + \ \"name\": \"accuracy\", # The name of the metric. Visualized\ + \ as the column name in the runs table.\n \"numberValue\"\ + : accuracy, # The value of the metric. Must be a numeric value.\n \ + \ \"format\": \"PERCENTAGE\", # The optional format of\ + \ the metric. Supported values are \"RAW\" (displayed in raw format)\ + \ and \"PERCENTAGE\" (displayed in percentage format).\n \ + \ },\n {\n \"name\": \"val-accuracy\", #\ + \ The name of the metric. Visualized as the column name in the runs\ + \ table.\n \"numberValue\": val_accuracy, # The value\ + \ of the metric. Must be a numeric value.\n \"format\"\ + : \"PERCENTAGE\", # The optional format of the metric. 
Supported values\ + \ are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed\ + \ in percentage format).\n },\n ]\n }\n with\ + \ open(mlpipeline_metrics_path, \"w\") as f:\n json.dump(metrics,\ + \ f)\n\n # If this step is to fast, the metrics collector fails as\ + \ the\n # pod is already finished before it can collect the metrics.\n\ + \ time.sleep(10)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Train\ + \ e2e', description='This is the simulated train part of our ML pipeline\ + \ where training is performed')\n_parser.add_argument(\"--input-nr\"\ + , dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--lr\", dest=\"lr\", type=float, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--optimizer\",\ + \ dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--loss\", dest=\"loss\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--epochs\", dest=\"\ + epochs\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --batch-size\", dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--mlpipeline-metrics\", dest=\"mlpipeline_metrics_path\"\ + , type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n\ + _parsed_args = vars(_parser.parse_args())\n\n_outputs = train_e2e(**_parsed_args)\n" + image: python:3.7 + inputs: + artifacts: + - name: prep-e2e-output_nr + path: /tmp/inputs/input_nr/data + parameters: + - name: batch_size + - name: epochs + - name: loss + - name: lr + - name: optimizer + metadata: + annotations: + pipelines.kubeflow.org/arguments.parameters: '{"batch_size": "{{inputs.parameters.batch_size}}", + "epochs": "{{inputs.parameters.epochs}}", "loss": "{{inputs.parameters.loss}}", + "lr": "{{inputs.parameters.lr}}", "optimizer": "{{inputs.parameters.optimizer}}"}' + 
pipelines.kubeflow.org/component_ref: '{}' + pipelines.kubeflow.org/component_spec: '{"description": "This is the + simulated train part of our ML pipeline where training is performed", + "implementation": {"container": {"args": ["--input-nr", {"inputPath": + "input_nr"}, {"if": {"cond": {"isPresent": "lr"}, "then": ["--lr", + {"inputValue": "lr"}]}}, {"if": {"cond": {"isPresent": "optimizer"}, + "then": ["--optimizer", {"inputValue": "optimizer"}]}}, {"if": {"cond": + {"isPresent": "loss"}, "then": ["--loss", {"inputValue": "loss"}]}}, + {"if": {"cond": {"isPresent": "epochs"}, "then": ["--epochs", {"inputValue": + "epochs"}]}}, {"if": {"cond": {"isPresent": "batch_size"}, "then": + ["--batch-size", {"inputValue": "batch_size"}]}}, "--mlpipeline-metrics", + {"outputPath": "mlpipeline_metrics"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf + \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def _make_parent_dirs_and_return_path(file_path: str):\n import + os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return + file_path\n\ndef train_e2e(\n input_nr_path, # type: ignore # + noqa: F821\n mlpipeline_metrics_path, # type: ignore # noqa: F821\n lr + = 1e-4,\n optimizer = \"Adam\",\n loss = \"categorical_crossentropy\",\n epochs + = 1,\n batch_size = 32,\n):\n \"\"\"\n This is the simulated + train part of our ML pipeline where training is performed\n \"\"\"\n import + json \n import time\n with open(input_nr_path, ''r'') as reader:\n line + = reader.readline()\n histogram_norm_value = int(line)\n\n accuracy + = (batch_size + histogram_norm_value)/ (batch_size + epochs+histogram_norm_value)\n val_accuracy + = accuracy * 0.9\n metrics = {\n \"metrics\": [\n {\n \"name\": + \"accuracy\", # The name of the metric. Visualized as the column + name in the runs table.\n \"numberValue\": accuracy, # + The value of the metric. Must be a numeric value.\n \"format\": + \"PERCENTAGE\", # The optional format of the metric. 
Supported values + are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed + in percentage format).\n },\n {\n \"name\": + \"val-accuracy\", # The name of the metric. Visualized as the column + name in the runs table.\n \"numberValue\": val_accuracy, # + The value of the metric. Must be a numeric value.\n \"format\": + \"PERCENTAGE\", # The optional format of the metric. Supported values + are \"RAW\" (displayed in raw format) and \"PERCENTAGE\" (displayed + in percentage format).\n },\n ]\n }\n with + open(mlpipeline_metrics_path, \"w\") as f:\n json.dump(metrics, + f)\n\n # If this step is to fast, the metrics collector fails as + the\n # pod is already finished before it can collect the metrics.\n time.sleep(10)\n\nimport + argparse\n_parser = argparse.ArgumentParser(prog=''Train e2e'', description=''This + is the simulated train part of our ML pipeline where training is performed'')\n_parser.add_argument(\"--input-nr\", + dest=\"input_nr_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--lr\", + dest=\"lr\", type=float, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--optimizer\", + dest=\"optimizer\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--loss\", + dest=\"loss\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--epochs\", + dest=\"epochs\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--batch-size\", + dest=\"batch_size\", type=int, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--mlpipeline-metrics\", + dest=\"mlpipeline_metrics_path\", type=_make_parent_dirs_and_return_path, + required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs + = train_e2e(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": + [{"name": "input_nr", "type": "Integer"}, {"default": "0.0001", "name": + "lr", "optional": true, "type": "Float"}, {"default": 
"Adam", "name": + "optimizer", "optional": true, "type": "String"}, {"default": "categorical_crossentropy", + "name": "loss", "optional": true, "type": "String"}, {"default": "1", + "name": "epochs", "optional": true, "type": "Integer"}, {"default": + "32", "name": "batch_size", "optional": true, "type": "Integer"}], + "name": "Train e2e", "outputs": [{"name": "mlpipeline_metrics", "type": + "Metrics"}]}' + pipelines.kubeflow.org/max_cache_staleness: P0D + pipelines.kubeflow.org/task_display_name: Generate dummy metrics + labels: + katib.kubeflow.org/model-training: 'true' + pipelines.kubeflow.org/enable_caching: 'true' + pipelines.kubeflow.org/kfp_sdk_version: 1.8.12 + pipelines.kubeflow.org/pipeline-sdk-type: kfp + name: train-e2e + outputs: + artifacts: + - name: mlpipeline-metrics + path: /tmp/outputs/mlpipeline_metrics/data From 6a0bdd3b95ea453dcb7d1ea039174a51c0c514e1 Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Fri, 21 Jul 2023 08:40:43 +0200 Subject: [PATCH 22/26] Update psutil version to fix Docker build error --- .../v1beta1/kfp-metricscollector/v1/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt index fa4fc7d22b9..b73a43f3fba 100644 --- a/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt +++ b/cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt @@ -1,4 +1,4 @@ -psutil==5.8.0 +psutil==5.9.4 rfc3339>=6.2 grpcio==1.41.1 googleapis-common-protos==1.6.0 From 182b78722ed02ae38ed9b6c7ef2176e9d24f9181 Mon Sep 17 00:00:00 2001 From: pre-commit fix Vito Zanotelli Date: Tue, 12 Sep 2023 22:19:05 +0200 Subject: [PATCH 23/26] Move kubeflow installation after katib Otherwise the patching of the `katib-controller` cluster role would not work. 
--- .../v1beta1/scripts/gh-actions/setup-katib.sh | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 8a3f41b7049..7997eed306f 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -68,17 +68,6 @@ if "$DEPLOY_TRAINING_OPERATOR"; then kustomize build "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION" | kubectl apply -f - fi -# If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines. -# found at: https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize -if "$DEPLOY_KFP"; then - echo "Deploying Kubeflow Pipelines version $KFP_VERSION" - kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}" - kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s - kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}" - kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s - #kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80 - kubectl patch ClusterRole katib-controller -n kubeflow --type=json -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]' -fi echo "Deploying Katib" cd ../../../../../ && WITH_DATABASE_TYPE=$WITH_DATABASE_TYPE make deploy && cd - @@ -99,6 +88,19 @@ kubectl -n kubeflow get svc echo "Katib pods" kubectl -n kubeflow get pod +# If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines. 
+# found at: https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize +if [ $DEPLOY_KFP ]; then + echo "Deploying Kubeflow Pipelines version $KFP_VERSION" + kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}" + kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s + kubectl apply -k "${KFP_BASE_URL}/env/${KFP_ENV}/?ref=${KFP_VERSION}" + kubectl wait pods -l application-crd-id=kubeflow-pipelines -n kubeflow --for condition=Ready --timeout=1800s + #kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80 + kubectl patch ClusterRole katib-controller -n kubeflow --type=json -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["get", "list", "watch", "create", "delete"]}}]' + kubectl label namespace kubeflow katib.kubeflow.org/metrics-collector-injection=enabled +fi + # Check that Katib is working with 2 Experiments. kubectl apply -f ../../testdata/valid-experiment.yaml kubectl delete -f ../../testdata/valid-experiment.yaml From 9fc7c0215b84df9727053ff98036082ccc7877c2 Mon Sep 17 00:00:00 2001 From: pre-commit fix Vito Zanotelli Date: Tue, 12 Sep 2023 22:20:11 +0200 Subject: [PATCH 24/26] Parametrize kubeflow version This enables the user to set the KFP version, which should make it possible to use this script to install KFP v1 and v2 without additional parameters.
--- test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh index 7997eed306f..1fa62dc8f2d 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh @@ -23,6 +23,7 @@ cd "$(dirname "$0")" DEPLOY_KATIB_UI=${1:-false} DEPLOY_TRAINING_OPERATOR=${2:-false} WITH_DATABASE_TYPE=${3:-mysql} +# false or a specific KFP version (eg 1.8.1) DEPLOY_KFP=${4:-false} E2E_TEST_IMAGE_TAG="e2e-test" @@ -30,9 +31,6 @@ TRAINING_OPERATOR_VERSION="v1.6.0-rc.0" KFP_ENV=platform-agnostic-emissary KFP_BASE_URL="github.com/kubeflow/pipelines/manifests/kustomize" -# This is one of the latest KFPv1 version which was compatible with a -# recent K8s version at the time of writing (eg 1.8.22 gave an error). -KFP_VERSION="1.8.1" echo "Start to install Katib" @@ -91,6 +89,7 @@ kubectl -n kubeflow get pod # If the user wants to deploy kubeflow pipelines, then use the kustomization file for kubeflow pipelines. # found at: https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize if [ $DEPLOY_KFP ]; then + KFP_VERSION="$DEPLOY_KFP" echo "Deploying Kubeflow Pipelines version $KFP_VERSION" kubectl apply -k "${KFP_BASE_URL}/cluster-scoped-resources/?ref=${KFP_VERSION}" kubectl wait crd/applications.app.k8s.io --for condition=established --timeout=60s From 579546cd52a8851a10df2781244e9f740a9209f5 Mon Sep 17 00:00:00 2001 From: pre-commit fix Vito Zanotelli Date: Tue, 12 Sep 2023 22:22:37 +0200 Subject: [PATCH 25/26] Add `namespace` parameter This is required for kubeflow pipelines as I found no easy way to install kubeflow pipelines into the `default` namespace that was previously the hardcoded one. Now the namespace can be passed as a parameter.
--- .../v1beta1/scripts/gh-actions/run-e2e-experiment.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh index 5a20faa6934..72ae394000f 100755 --- a/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh +++ b/test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh @@ -15,7 +15,12 @@ # limitations under the License. # This shell script is used to run Katib Experiment. -# Input parameter - path to Experiment yaml. +# Input parameters +# - comma separated list of experiment names (exp1,exp2). +# For each experiment name, the script will search the folder +# `examples/v1beta1` for a file "{exp_name}.yaml" that will be +# executed as a katib experiment. Default: "" +# - namespace to execute experiment in. Default: default set -o errexit set -o nounset @@ -24,6 +29,7 @@ set -o pipefail cd "$(dirname "$0")" EXPERIMENT_FILES=${1:-""} IFS="," read -r -a EXPERIMENT_FILE_ARRAY <<< "$EXPERIMENT_FILES" +NAMESPACE=${2:-"default"} echo "Katib deployments" kubectl -n kubeflow get deploy @@ -44,7 +50,7 @@ fi for exp_name in "${EXPERIMENT_FILE_ARRAY[@]}"; do echo "Running Experiment from $exp_name file" exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml") - python run-e2e-experiment.py --experiment-path "${exp_path}" --namespace default \ + python run-e2e-experiment.py --experiment-path "${exp_path}" --namespace "${NAMESPACE}" \ --verbose || (kubectl get pods -n kubeflow && exit 1) done From 582a6a7dc83f353d5052469a6b0c0d8a8cb02725 Mon Sep 17 00:00:00 2001 From: pre-commit fix Vito Zanotelli Date: Tue, 12 Sep 2023 22:25:42 +0200 Subject: [PATCH 26/26] Add kfpv1 e2e test This action should now run the kubeflow pipeline v1 e2e example. This required the extension of the `template-e2e-test` to include parameters to a) install kfp b) select the `kubeflow` namespace (instead of default) to run the tests with. 
--- .github/workflows/e2e-test-kfpv1.yaml | 45 +++++++++++++++++++ .../workflows/template-e2e-test/action.yaml | 13 +++++- 2 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/e2e-test-kfpv1.yaml diff --git a/.github/workflows/e2e-test-kfpv1.yaml b/.github/workflows/e2e-test-kfpv1.yaml new file mode 100644 index 00000000000..52807f7cbaf --- /dev/null +++ b/.github/workflows/e2e-test-kfpv1.yaml @@ -0,0 +1,45 @@ +name: E2E Test with kubeflow pipelines v1 + +on: + pull_request: + paths-ignore: + - "pkg/new-ui/v1beta1/frontend/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +jobs: + e2e: + runs-on: ubuntu-20.04 + timeout-minutes: 120 + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Test Env + uses: ./.github/workflows/template-setup-e2e-test + with: + kubernetes-version: ${{ matrix.kubernetes-version }} + python-version: "3.10" + + - name: Run e2e test with ${{ matrix.experiments }} experiments + uses: ./.github/workflows/template-e2e-test + with: + experiments: ${{ matrix.experiments }} + training-operator: true + # Comma Delimited + trial-images: kfpv1-metrics-collector + install-kfp: 1.8.1 + experiment-namespace: kubeflow + + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.23.13", "v1.24.7", "v1.25.3"] + # Comma Delimited + experiments: + - "katib-kfp-example-e2e-v1" diff --git a/.github/workflows/template-e2e-test/action.yaml b/.github/workflows/template-e2e-test/action.yaml index ef1ca26064d..6337c8215bf 100644 --- a/.github/workflows/template-e2e-test/action.yaml +++ b/.github/workflows/template-e2e-test/action.yaml @@ -21,6 +21,15 @@ inputs: required: false description: mysql or postgres default: mysql + install-kfp: + required: false + description: whether kubeflow pipelines is required + as a dependency. 
If so provide version as string (eg 1.8.1) + default: false + experiment-namespace: + required: false + description: namespace to execute test experiment in + default: default runs: using: composite @@ -31,8 +40,8 @@ runs: - name: Setup Katib shell: bash - run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} + run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} ${{ inputs.install-kfp }} - name: Run E2E Experiment shell: bash - run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} + run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} ${{ inputs.experiment-namespace }}