Feature/Example for training KFP v1 #2118
Dockerfile (new file):

```dockerfile
FROM python:3.10-slim

ARG TARGETARCH
ENV TARGET_DIR /opt/katib
ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfp-metricscollector/v1
ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfp-metricscollector/v1:${TARGET_DIR}/pkg/metricscollector/v1beta1/common/

ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/

WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}

# Extra system packages are only needed on arm64, where some Python
# dependencies must be built from source.
RUN if [ "${TARGETARCH}" = "arm64" ]; then \
    apt-get -y update && \
    apt-get -y install gfortran libpcre3 libpcre3-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*; \
    fi

RUN pip install --no-cache-dir -r requirements.txt
# Make the install group-writable so the image can run under arbitrary UIDs.
RUN chgrp -R 0 ${TARGET_DIR} \
    && chmod -R g+rwX ${TARGET_DIR}

ENTRYPOINT ["python", "main.py"]
```
main.py (new file):

```python
# Copyright 2023 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from logging import INFO, StreamHandler, getLogger

import api_pb2
import const
import grpc
from metrics_loader import MetricsCollector
from pns import WaitMainProcesses

timeout_in_seconds = 60


def parse_options():
    parser = argparse.ArgumentParser(
        description="KFP V1 MetricsCollector", add_help=True
    )

    # TODO (andreyvelich): Add early stopping flags.
    parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="")
    parser.add_argument("-t", "--pod_name", type=str, default="")
    parser.add_argument(
        "-path",
        "--metrics_file_dir",
        type=str,
        default=const.DEFAULT_METRICS_FILE_KFPV1_DIR,
    )
    parser.add_argument("-m", "--metric_names", type=str, default="")
    parser.add_argument("-o-type", "--objective_type", type=str, default="")
    parser.add_argument("-f", "--metric_filters", type=str, default="")
    parser.add_argument(
        "-p", "--poll_interval", type=int, default=const.DEFAULT_POLL_INTERVAL
    )
    parser.add_argument(
        "-timeout", "--timeout", type=int, default=const.DEFAULT_TIMEOUT
    )
    parser.add_argument(
        "-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES
    )
    opt = parser.parse_args()
    return opt


if __name__ == "__main__":
    logger = getLogger(__name__)
    handler = StreamHandler()
    handler.setLevel(INFO)
    logger.setLevel(INFO)
    logger.addHandler(handler)
    logger.propagate = False

    opt = parse_options()
    wait_all_processes = opt.wait_all_processes.lower() == "true"
    db_manager_server = opt.db_manager_server_addr.split(":")
    # Derive the Trial name by stripping the final dash-separated segment
    # (the pod's random suffix) from the pod name.
    trial_name = "-".join(opt.pod_name.split("-")[:-1])
    if len(db_manager_server) != 2:
        raise Exception(
            "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr
        )

    WaitMainProcesses(
        pool_interval=opt.poll_interval,
        timout=opt.timeout,  # sic: keyword names follow pns.WaitMainProcesses
        wait_all=wait_all_processes,
        completed_marked_dir=None,
```
Review thread on `completed_marked_dir=None`:

Why do we set `completed_marked_dir=None`?

The documentation on this is a bit sparse, but if I understand the code right, setting it would require the Kubeflow pipeline to write a completion-marker file that the collector then checks (see katib/pkg/metricscollector/v1beta1/common/pns.py, lines 95 to 104 in f740889). So I think `None` is the right value here.

Thanks for the explanation. Let me check.
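A hedged paraphrase of the pns.py behaviour referenced in that thread (the helper name, marker-file naming, and marker value are assumptions for illustration, not copied from that file):

```python
# Hypothetical paraphrase of the completed-marker check discussed above:
# when completed_marked_dir is set, a finished main process only counts as
# completed if it left a marker file behind in that directory.
import os


def is_marked_completed(pid: int, completed_marked_dir: str) -> bool:
    mark_file = os.path.join(completed_marked_dir, "{}.pid".format(pid))
    if not os.path.isfile(mark_file):
        return False
    with open(mark_file) as f:
        return f.read().strip() == "completed"
```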
```python
        # (main.py continued from above)
    )

    mc = MetricsCollector(opt.metric_names.split(";"))
    observation_log = mc.parse_file(opt.metrics_file_dir)

    channel = grpc.beta.implementations.insecure_channel(
        db_manager_server[0], int(db_manager_server[1])
    )

    with api_pb2.beta_create_DBManager_stub(channel) as client:
        logger.info(
            "In %s %s metrics will be reported."
            % (trial_name, len(observation_log.metric_logs))
        )
        client.ReportObservationLog(
            api_pb2.ReportObservationLogRequest(
                trial_name=trial_name, observation_log=observation_log
            ),
            timeout=timeout_in_seconds,
        )
```
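For orientation, a sketch of how this collector might be launched, mirroring the flags defined in parse_options() above (the service address, pod name, directory, and metric names are illustrative assumptions):

```python
# Illustrative only: the flag names come from parse_options(); every value
# here is made up for the example.
import subprocess

subprocess.run(
    [
        "python", "main.py",
        "--db_manager_server_addr", "katib-db-manager.kubeflow:6789",
        "--pod_name", "my-experiment-trial-abc12-metrics",
        "--metrics_file_dir", "/tmp/outputs",
        "--metric_names", "accuracy;loss",
    ],
    check=True,
)
```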
requirements.txt (new file):

```text
psutil==5.8.0
rfc3339>=6.2
grpcio==1.41.1
googleapis-common-protos==1.6.0
protobuf==3.20.0
```
(One file in this diff was too large to render and is omitted here.)
metrics_loader.py (new file):

```python
# Copyright 2023 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Kubeflow Pipelines metrics collector KFPMetricParser parses the
# metrics file and returns an ObservationLog of the metrics specified.
# Some documentation on the metrics file structure can be found here:
# https://v0-6.kubeflow.org/docs/pipelines/sdk/pipelines-metrics/

from datetime import datetime
from logging import getLogger, StreamHandler, INFO
import os
from typing import List
import json

import rfc3339
import api_pb2
from pkg.metricscollector.v1beta1.common import const


class KFPMetricParser:
    def __init__(self, metric_names):
        self.metric_names = metric_names

    @staticmethod
    def find_all_files(directory):
        for root, dirs, files in os.walk(directory):
            for f in files:
                yield os.path.join(root, f)

    def parse_metrics(self, metric_file_path: str) -> List[api_pb2.MetricLog]:
        """Parse a Kubeflow Pipelines metrics file.

        Args:
            metric_file_path (str): path to the metrics file

        Returns:
            List[api_pb2.MetricLog]: a list of logged metrics
        """
        metrics = []
        with open(metric_file_path) as f:
            metrics_dict = json.load(f)
            for m in metrics_dict["metrics"]:
                name = m["name"]
                value = m["numberValue"]
                if name in self.metric_names:
                    ml = api_pb2.MetricLog(
                        time_stamp=rfc3339.rfc3339(datetime.now()),
                        metric=api_pb2.Metric(name=name, value=str(value)),
                    )
                    metrics.append(ml)
        return metrics


class MetricsCollector:
    def __init__(self, metric_names):
        self.logger = getLogger(__name__)
        handler = StreamHandler()
        handler.setLevel(INFO)
        self.logger.setLevel(INFO)
        self.logger.addHandler(handler)
        self.logger.propagate = False
        self.metrics = metric_names
        self.parser = KFPMetricParser(metric_names)

    def parse_file(self, directory):
        """Parse the Kubeflow Pipelines metrics files in a directory."""
        mls = []
        for f in self.parser.find_all_files(directory):
            if os.path.isdir(f):
                continue
            try:
                self.logger.info(f + " will be parsed.")
                mls.extend(self.parser.parse_metrics(f))
            except Exception as e:
                self.logger.warning("Unexpected error: " + str(e))
                continue

        # Metrics logs must contain at least one objective metric value.
        # The objective metric is located at the first index.
        is_objective_metric_reported = False
        for ml in mls:
            if ml.metric.name == self.metrics[0]:
                is_objective_metric_reported = True
                break
        # If the objective metric was not reported, insert the unavailable
        # value into the DB.
        if not is_objective_metric_reported:
            mls = [
                api_pb2.MetricLog(
                    time_stamp=rfc3339.rfc3339(datetime.now()),
                    metric=api_pb2.Metric(
                        name=self.metrics[0], value=const.UNAVAILABLE_METRIC_VALUE
                    ),
                )
            ]
            self.logger.info(
                "Objective metric {} is not found in metrics file, {} value is reported".format(
                    self.metrics[0], const.UNAVAILABLE_METRIC_VALUE
                )
            )

        return api_pb2.ObservationLog(metric_logs=mls)
```
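For reference, a minimal sketch of the metrics file layout this parser consumes, following the KFP v1 mlpipeline-metrics convention linked in the header comment (the file name and values here are illustrative):

```python
# Illustrative only: writes a metrics file in the KFP v1 "mlpipeline-metrics"
# shape that KFPMetricParser.parse_metrics() reads — a top-level "metrics"
# list whose entries carry "name" and "numberValue".
import json

example = {
    "metrics": [
        {"name": "accuracy", "numberValue": 0.94},
        {"name": "loss", "numberValue": 0.21},
    ]
}
with open("mlpipeline-metrics.json", "w") as f:
    json.dump(example, f)
```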
Review conversation:

Please let's use the same structure as for the other metrics collectors: cmd/metricscollector/v1beta1/kfp-metricscollector/Dockerfile
@andreyvelich My concern is where we will put the Dockerfile for KFP v2. So I would suggest we put the Dockerfile for KFP v1 here. WDYT?
Oh, I see. Do we really need to support KFP v1 if, eventually, every Kubeflow user should migrate to KFP v2?
Because KFP v1 and KFP v2 aren't compatible, I think migrating from v1 to v2 is hard in production, so I guess users will need a lot of time to update the version. Hence, supporting KFP v1 in Katib would be useful. WDYT?
I see. In any case, I still have a question (#2118 (review)): why do we need a separate metrics collector for KFP if we just need to read the logs from the metrics file?
@zijianjoy The Katib metrics collector parses the metrics file line by line and expects the metric name and value to be located on a single line.

@votti From the log line I can see that the metrics are written to the /tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz file, isn't it?

@votti Yeah, this could be an issue, since we override the start command to make sure we redirect StdOut to the /var/log/katib/metrics.log file so that the Katib metrics collector can parse it; otherwise, the metrics collector can't parse the StdOut. The main difference between the StdOut and File metrics collectors is that StdOut tails the /var/log/katib/metrics.log file and prints the logs.
file and prints logs.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@metrics file:
One of the complexities kubeflow
pipeline
manages is to handle output artifacts (usually compressing them and storing them and saving them to an s3 storage). This is what seems to be broken when using thefilecollector
, as something while compressing and copying the file to/tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz
seems to go wrong.After finding some time to look into it, I think the reason is very similar to the
stdout
collector:The collector modifies the argo CMD/ARG in a way that I think causes these issues:
From the pod definition: Unmodified (eg when using the kubeflow custom metrics collector):
When using the
filecollector
as metrics collector:I think this could be solved by following this proposal: #2181
Until this is fixed, I think having a custom metrics collector that does not modify the command is a necessary workaround.
@votti I think this could also be solved with this feature, couldn't it: #577? Basically, we can use the Katib SDK to implement an API for pushing metrics to the Katib DB instead of using pull-based metrics collectors, which require changing the entrypoint. The user would then report metrics in their objective training function; see the sketch below. We might need to make additional changes to the Katib controller to verify that metrics were reported by the user.
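A minimal sketch of what such push-based reporting could look like inside the objective function, assuming a hypothetical report_metrics helper in the Katib SDK (neither the helper nor its signature is confirmed by this thread):

```python
# Hypothetical sketch: `report_metrics` and its signature are assumptions
# for illustration, not a confirmed Katib SDK API.
from kubeflow import katib


def objective(lr: float) -> None:
    # Stand-in for a real training loop; the accuracy value is fabricated
    # purely to show the reporting call.
    accuracy = 1.0 - lr
    # Push the metric straight to the Katib DB instead of printing it for a
    # pull-based collector that must rewrite the container entrypoint.
    katib.report_metrics({"accuracy": accuracy})
```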
Push-based metrics collection sounds like a good potential solution! So the KatibClient can automatically infer which Trial these metrics are associated with?
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@votti Currently, user can get the Trial name using
${trialSpec.Name}
template in their Trial's Pod environment vars. Then, user can runKatibClient
API with appropriate Trial Name to insert metrics to the Katib DB.I think, we should always add
TRIAL_NAME
env to the Trial pod since it is useful for many use-cases (e.g. for exporting trained model to S3, saving Trial metrics to DB, etc.)WDYT @tenzen-y @johnugeorge @votti ?
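A sketch of how a Trial could resolve its own name and push metrics under that scheme (the TRIAL_NAME variable and the metrics-insertion method are assumptions for illustration):

```python
# Hypothetical sketch: assumes the controller injects TRIAL_NAME into the
# Trial pod (e.g. via the ${trialSpec.Name} template discussed above), and
# that KatibClient exposes some method for inserting Trial metrics; the
# method name below is assumed, not a confirmed API.
import os

from kubeflow.katib import KatibClient

trial_name = os.environ["TRIAL_NAME"]
client = KatibClient()
client.report_trial_metrics(trial_name, {"accuracy": 0.94})  # assumed API
```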