Feature/Example for training KFP v1 #2118

Closed · wants to merge 26 commits

Commits (26)
03fa850
Adds a first draft of a kfpv1-metricscollector
Feb 9, 2023
8918473
Use PodName as input
votti Feb 10, 2023
fd53d85
Adds example for tuning a kfp v1 pipeline with Katib
votti Feb 15, 2023
e9a0051
Adds python < 3.11 compatibility
votti Feb 15, 2023
17123d6
Add histogram equalization before rescaling
votti Feb 15, 2023
4f19db8
Update copyright date
votti Mar 16, 2023
9f83b0f
Update python version
votti Mar 16, 2023
61e77ea
Publish the docker image in kubeflowkatib
votti Mar 16, 2023
88c20c3
Fix suggested typo fixes
votti Jun 21, 2023
904d07d
Move KFP V1 metrics collector docker files to v1 subfolder
votti Jun 21, 2023
31655dd
Support loading of folder of metrics collector files
votti Jun 21, 2023
c458541
Move kfpv1 metricscollector in v1 subfolder
votti Jun 21, 2023
cee9970
Remove duplicated notebook section
votti Jun 21, 2023
f7e697b
Add dependencies for KFPv1 e2e testing
Jul 18, 2023
36ed372
TMP: changes to run tests locally
Jul 18, 2023
15c4a4b
Add missing ClusterRole update
Jul 18, 2023
741059f
Remove accidentally included `self`
Jul 18, 2023
7d33b7b
Rename parameter to more meaningful name
Jul 18, 2023
35df815
Extend example notebook with simple example for e2e tests
Jul 20, 2023
0504085
Revert "TMP: changes to run tests locally"
Jul 20, 2023
4cddd3e
Adds spec of a simple kfp1+katib experiment spec
Jul 20, 2023
6a0bdd3
Update psutil version to fix Docker build error
Jul 21, 2023
182b787
Move kubeflow installation after katib
Sep 12, 2023
9fc7c02
Parametrize kubeflow version
Sep 12, 2023
579546c
Add `namespace` parameter
Sep 12, 2023
582a6a7
Add kfpv1 e2e test
Sep 12, 2023
45 changes: 45 additions & 0 deletions .github/workflows/e2e-test-kfpv1.yaml
@@ -0,0 +1,45 @@
name: E2E Test with kubeflow pipelines v1

on:
  pull_request:
    paths-ignore:
      - "pkg/new-ui/v1beta1/frontend/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}
          python-version: "3.10"

      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          training-operator: true
          # Comma Delimited
          trial-images: kfpv1-metrics-collector
          install-kfp: 1.8.1
          experiment-namespace: kubeflow

    strategy:
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.23.13", "v1.24.7", "v1.25.3"]
        # Comma Delimited
        experiments:
          - "katib-kfp-example-e2e-v1"
2 changes: 2 additions & 0 deletions .github/workflows/publish-core-images.yaml
@@ -32,3 +32,5 @@ jobs:
         dockerfile: cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile
       - component-name: tfevent-metrics-collector
         dockerfile: cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
+      - component-name: kfpv1-metrics-collector
+        dockerfile: cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile
13 changes: 11 additions & 2 deletions .github/workflows/template-e2e-test/action.yaml
@@ -21,6 +21,15 @@ inputs:
     required: false
     description: mysql or postgres
     default: mysql
+  install-kfp:
+    required: false
+    description: whether Kubeflow Pipelines is required as a dependency; if so, provide the version as a string (e.g. 1.8.1)
+    default: false
+  experiment-namespace:
+    required: false
+    description: namespace to execute the test experiment in
+    default: default

runs:
  using: composite

@@ -31,8 +40,8 @@ runs:
   - name: Setup Katib
     shell: bash
-    run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }}
+    run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh ${{ inputs.katib-ui }} ${{ inputs.training-operator }} ${{ inputs.database-type }} ${{ inputs.install-kfp }}

   - name: Run E2E Experiment
     shell: bash
-    run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }}
+    run: ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh ${{ inputs.experiments }} ${{ inputs.experiment-namespace }}
24 changes: 24 additions & 0 deletions cmd/metricscollector/v1beta1/kfp-metricscollector/v1/Dockerfile
@@ -0,0 +1,24 @@
FROM python:3.10-slim
Member:

Please let's use the same structure as for other metrics collectors:
cmd/metricscollector/v1beta1/kfp-metricscollector/Dockerfile

Member:

@andreyvelich My concern is where we will put the Dockerfile for KFP v2. So I would suggest we put the Dockerfile for KFP v1 here. WDYT?

Member:

Oh, I see. Do we really need to support KFP v1 if, eventually, every Kubeflow user should migrate to KFP v2?

Member:

Because KFP v1 and KFP v2 aren't compatible, I think migrating from v1 to v2 is hard in production, so I guess users will need a lot of time to update the version.

Hence, supporting KFP v1 in Katib would be useful. WDYT?

Member:

I see. In any case, I still have a question (#2118 (review)): why do we need a separate metrics collector for KFP if we just need to read the logs from the metrics file?

Member (@andreyvelich, Jul 26, 2023):

> Is there any reason for restricting the metrics file configuration to one line?

@zijianjoy The Katib metrics collector parses the metrics file line by line and expects the metric name and value to be located on a single line.

@votti From the log line I can see that metrics are written to the /tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz file, isn't it?

> Btw: if you're wondering why I don't just use the StdOut collector and additionally print the metrics to the log: this is because that also broke the Argo command.

@votti Yeah, this could be an issue, since we override the start command to make sure we redirect StdOut to the /var/log/katib/metrics.log file, so the Katib metrics collector can parse this file. Otherwise, the metrics collector can't parse the StdOut. The main difference between the StdOut and File metrics collectors is that StdOut tails /var/log/katib/metrics.log and prints the logs.
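For illustration, here is a minimal sketch of the format mismatch being discussed — the exact formats are assumptions based on the Katib and KFP v1 documentation, not taken from this PR:

# Katib's file metrics collector scans a plain-text log line by line,
# expecting metric name/value pairs such as:
#
#   accuracy=0.94
#   loss=0.21
#
# A KFP v1 component instead writes its metrics as a single JSON artifact
# (the mlpipeline-metrics output), roughly like this:
import json

kfp_v1_metrics = {
    "metrics": [
        {"name": "accuracy", "numberValue": 0.94, "format": "RAW"},
        {"name": "loss", "numberValue": 0.21, "format": "RAW"},
    ]
}
with open("/tmp/outputs/mlpipeline_metrics/data", "w") as f:
    json.dump(kfp_v1_metrics, f)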

Author (votti):

Re: metrics file:
One of the complexities Kubeflow Pipelines manages is handling output artifacts (usually compressing them and saving them to S3 storage). This is what seems to be broken when using the file collector: something appears to go wrong while compressing and copying the file to /tmp/argo/outputs/artifacts/mlpipeline-metrics.tgz.

After finding some time to look into it, I think the reason is very similar to the StdOut collector: the collector modifies the Argo CMD/ARGS in a way that I think causes these issues.

From the pod definition — unmodified (e.g. when using the custom KFP metrics collector):

...
      _outputs = train_e2e(**_parsed_args)
      
    Args:

      --input-nr
      /tmp/inputs/input_nr/data
      --lr
      0.0005293023468535503
      --optimizer
      Adam
      --loss
      categorical_crossentropy
      --epochs
      3
      --batch-size
      36
      --mlpipeline-metrics
      /tmp/outputs/mlpipeline_metrics/data

When using the file collector as the metrics collector:

...
      _outputs = train_e2e(**_parsed_args)
       --input-nr /tmp/inputs/input_nr/data --lr 0.00021802007326291811 --optimizer Adam --loss categorical_crossentropy --epochs 3 --batch-size 53 --mlpipeline-metrics /tmp/outputs/mlpipeline_metrics/data && echo completed > /tmp/outputs/mlpipeline_metrics/$$$$.pid

I think this could be solved by following this proposal: #2181
Until this is fixed, I think having a custom metrics collector that does not modify the command is a necessary workaround.

Member (@andreyvelich, Aug 29, 2023):

@votti I think this could also be solved with this feature, couldn't it: #577?
Basically, we can use the Katib SDK to implement an API for pushing metrics to the Katib DB instead of using pull-based metrics collectors, which require changing the entrypoint.

Users would be required to report metrics in their objective training function.

For example:

import kubeflow.katib as katib

client = katib.KatibClient()
client.report(metrics={"accuracy": 0.9, "loss": 0.01})

We might need to make additional changes to the Katib controller to verify that metrics were reported by the user.

Author (votti):

Re: push-based metrics collection: that sounds like a good potential solution!
So can the KatibClient automatically infer which trial these metrics are associated with?

Member (@andreyvelich, Oct 24, 2023):

@votti Currently, users can get the Trial name using the ${trialSpec.Name} template in their Trial pod's environment variables. Then, they can call the KatibClient API with the appropriate Trial name to insert metrics into the Katib DB.
I think we should always add a TRIAL_NAME env var to the Trial pod, since it is useful for many use cases (e.g. exporting the trained model to S3, saving Trial metrics to the DB, etc.).
WDYT @tenzen-y @johnugeorge @votti?
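A rough sketch of how that push-based flow might look — assuming a TRIAL_NAME env var injected via the ${trialSpec.Name} template and a report-style API on KatibClient as proposed in #577; neither exists in this PR:

import os

import kubeflow.katib as katib

# Hypothetical: assumes Katib injects the Trial name into the pod environment.
trial_name = os.environ["TRIAL_NAME"]

client = katib.KatibClient()
# Hypothetical API from the #577 proposal, not a released KatibClient method.
client.report(trial_name=trial_name, metrics={"accuracy": 0.9, "loss": 0.01})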


ARG TARGETARCH
ENV TARGET_DIR /opt/katib
ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/kfp-metricscollector/v1
ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/kfp-metricscollector/v1:${TARGET_DIR}/pkg/metricscollector/v1beta1/common/

ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/

WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}

RUN if [ "${TARGETARCH}" = "arm64" ]; then \
    apt-get -y update && \
    apt-get -y install gfortran libpcre3 libpcre3-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*; \
    fi

RUN pip install --no-cache-dir -r requirements.txt
RUN chgrp -R 0 ${TARGET_DIR} \
    && chmod -R g+rwX ${TARGET_DIR}

ENTRYPOINT ["python", "main.py"]
101 changes: 101 additions & 0 deletions cmd/metricscollector/v1beta1/kfp-metricscollector/v1/main.py
@@ -0,0 +1,101 @@
# Copyright 2023 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from logging import INFO, StreamHandler, getLogger

import api_pb2
import const
import grpc
from metrics_loader import MetricsCollector
from pns import WaitMainProcesses

timeout_in_seconds = 60


def parse_options():
    parser = argparse.ArgumentParser(
        description="KFP V1 MetricsCollector", add_help=True
    )

    # TODO (andreyvelich): Add early stopping flags.
    parser.add_argument("-s-db", "--db_manager_server_addr", type=str, default="")
    parser.add_argument("-t", "--pod_name", type=str, default="")
    parser.add_argument(
        "-path",
        "--metrics_file_dir",
        type=str,
        default=const.DEFAULT_METRICS_FILE_KFPV1_DIR,
    )
    parser.add_argument("-m", "--metric_names", type=str, default="")
    parser.add_argument("-o-type", "--objective_type", type=str, default="")
    parser.add_argument("-f", "--metric_filters", type=str, default="")
    parser.add_argument(
        "-p", "--poll_interval", type=int, default=const.DEFAULT_POLL_INTERVAL
    )
    parser.add_argument(
        "-timeout", "--timeout", type=int, default=const.DEFAULT_TIMEOUT
    )
    parser.add_argument(
        "-w", "--wait_all_processes", type=str, default=const.DEFAULT_WAIT_ALL_PROCESSES
    )
    opt = parser.parse_args()
    return opt


if __name__ == "__main__":
    logger = getLogger(__name__)
    handler = StreamHandler()
    handler.setLevel(INFO)
    logger.setLevel(INFO)
    logger.addHandler(handler)
    logger.propagate = False
    opt = parse_options()
    wait_all_processes = opt.wait_all_processes.lower() == "true"
    db_manager_server = opt.db_manager_server_addr.split(":")
    trial_name = "-".join(opt.pod_name.split("-")[:-1])
    if len(db_manager_server) != 2:
        raise Exception(
            "Invalid Katib DB manager service address: %s" % opt.db_manager_server_addr
        )

    WaitMainProcesses(
        pool_interval=opt.poll_interval,
        timout=opt.timeout,
        wait_all=wait_all_processes,
        completed_marked_dir=None,
Member:

Why do we set completed_marked_dir to None? Can we set opt.metrics_file_dir instead?

Author (votti):

The documentation on this is a bit sparse, but if I understand the code right, this would require the Kubeflow pipeline to write a file <pid>.pid containing a TRAINING_COMPLETED marker into this directory, which it does not do:

if completed_marked_dir:
    mark_file = os.path.join(completed_marked_dir, "{}.pid".format(pid))
    # Check if file contains "completed" marker
    with open(mark_file) as file_obj:
        contents = file_obj.read()
        if contents.strip() != const.TRAINING_COMPLETED:
            raise Exception(
                "Unable to find marker: {} in file: {} with contents: {} for pid: {}".format(
                    const.TRAINING_COMPLETED, mark_file, contents, pid))
# Add main pid to finished pids set

So I think None is correct here.

Member:

Thanks for the explanation. Let me check.

    )

    mc = MetricsCollector(opt.metric_names.split(";"))
    observation_log = mc.parse_file(opt.metrics_file_dir)

    channel = grpc.beta.implementations.insecure_channel(
        db_manager_server[0], int(db_manager_server[1])
    )

    with api_pb2.beta_create_DBManager_stub(channel) as client:
        logger.info(
            "In "
            + trial_name
            + " "
            + str(len(observation_log.metric_logs))
            + " metrics will be reported."
        )
        client.ReportObservationLog(
            api_pb2.ReportObservationLogRequest(
                trial_name=trial_name, observation_log=observation_log
            ),
            timeout=timeout_in_seconds,
        )
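A small worked example of the trial_name derivation above — the pod name is made up; the split/join strips the trailing pod suffix to recover the Trial name:

pod_name = "kfp-example-trial-x7k2p"  # hypothetical Trial pod name
trial_name = "-".join(pod_name.split("-")[:-1])
print(trial_name)  # -> "kfp-example-trial"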
5 changes: 5 additions & 0 deletions cmd/metricscollector/v1beta1/kfp-metricscollector/v1/requirements.txt
@@ -0,0 +1,5 @@
psutil==5.9.4
rfc3339>=6.2
grpcio==1.41.1
googleapis-common-protos==1.6.0
protobuf==3.20.0
14 changes: 11 additions & 3 deletions examples/v1beta1/kubeflow-pipelines/README.md
@@ -3,6 +3,10 @@
The following examples show how to use Katib with
[Kubeflow Pipelines](https://github.com/kubeflow/pipelines).

Two different aspects are illustrated here:

A) How to orchestrate Katib experiments from Kubeflow Pipelines using the Katib Kubeflow component (examples 1 & 2)
B) How to use Katib to tune the parameters of a Kubeflow pipeline (example 3)

You can find the Katib Component source code for the Kubeflow Pipelines
[here](https://github.com/kubeflow/pipelines/tree/master/components/kubeflow/katib-launcher).

@@ -13,6 +17,8 @@ You have to install the following Python SDK to run these examples:
- [`kfp`](https://pypi.org/project/kfp/) >= 1.8.12
- [`kubeflow-katib`](https://pypi.org/project/kubeflow-katib/) >= 0.13.0

To run parameter tuning over Kubeflow pipelines, Katib additionally needs to be set up to work with Argo Workflows tasks. The setup is described in the example notebook (3).

## Multi-User Pipelines Setup

The Notebooks examples run Pipelines in multi-user mode and your Kubeflow Notebook
@@ -25,10 +31,12 @@ to give the Kubeflow Notebook access to run Kubeflow Pipelines.

The following Pipelines are deployed from Kubeflow Notebook:

- - [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb)
+ 1) [Kubeflow E2E MNIST](kubeflow-e2e-mnist.ipynb)
+ 2) [Katib Experiment with Early Stopping](early-stopping.ipynb)
- - [Katib Experiment with Early Stopping](early-stopping.ipynb)
+ 3) [Tune parameters of a `MNIST` Kubeflow pipeline with Katib](kubeflow-kfpv1-opt-mnist.ipynb)

- The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI:
+ The following Pipelines have to be compiled and uploaded to the Kubeflow Pipelines UI for examples 1 & 2:

- [MPIJob Horovod](mpi-job-horovod.py)