
refactor(sdk): added option for custom metric collector for tune in… #2406

Closed · wants to merge 13 commits
Changes from 11 commits
25 changes: 20 additions & 5 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -188,7 +188,10 @@ def tune(
retain_trials: bool = False,
packages_to_install: List[str] = None,
pip_index_url: str = "https://pypi.org/simple",
metrics_collector_config: Dict[str, Any] = {"kind": "StdOut"},
metrics_collector_config: Dict[str, Any] = {
"kind": "StdOut",
"custom_collector": None
},
):
"""Create HyperParameter Tuning Katib Experiment from the objective function.

@@ -253,9 +256,18 @@ def tune(
to the base image packages. These packages are installed before
executing the objective function.
pip_index_url: The PyPI url from which to install Python packages.
metrics_collector_config: Specify the config of metrics collector,
for example, `metrics_collector_config = {"kind": "Push"}`.
Currently, we only support `StdOut` and `Push` metrics collector.
Member: I think we may need to tell users about the supported types of MC. So can you re-add this line?

`metrics_collector_config`: Specify the configuration for the metrics collector with following keys:
- **kind**: Specify the kind of Metrics Collector. Currently supported values are:
- `StdOut`: Collects metrics from standard output.
- `None`: No metrics collection.
Member: This is not supported.

Author (@prakhar479, Sep 2, 2024): Thanks for pointing this out. Can you point me to where I can find all supported metrics collectors? For the current comment I referenced https://github.com/kubeflow/katib/blob/master/pkg/ui/v1beta1/frontend/src/app/models/experiment.k8s.model.ts

Member: Yeah, the UI also needs to be updated with the latest changes (cc @Electronic-Waste). Please refer to the official CRD APIs for the Metrics Collector spec: https://github.com/kubeflow/katib/blob/master/pkg/apis/controller/common/v1beta1/common_types.go#L207-L227

Member: Thanks Andrey, I'll update UI with the latest changes.

- `File`: Writes metrics to a file.
- `TensorFlowEvent`: Collects metrics in TensorFlow Event format.
- `PrometheusMetric`: Exposes metrics in a Prometheus-compatible format.
- `Custom`: For custom metrics collection. Use the "custom_collector" key to specify the collector instance.

- **custom_collector**: If the `kind` is set to `Custom`, you must provide an instance of a custom `V1Container` as the value. For example:
`metrics_collector_config = {"kind" : "Custom", "custom_collector": <Instance of V1Container>}`.
Member: Can we add support for custom_collectors in follow-up PRs when we get user requests? I feel that Data Scientists who will use the tune API don't need such functionality.

Author: Sorry, but can you elaborate on this a bit? I am not quite sure what this intends, since custom metrics collection is already supported by the underlying framework for the tune function.

Member: I meant that we can introduce it once we find use cases where the tune function needs to be used with a Custom metrics collector. For example, I can see the value of the File metrics collector, since users can write metrics into a specific file during their training script, similar to TensorFlow events. However, for the custom metrics collector, I can't see a use case where it would be useful with the tune function.

Any thoughts @shannonbradshaw @prakhar479 @johnugeorge @tenzen-y ?

Raises:
ValueError: Function arguments have incorrect type or value.
@@ -396,7 +408,10 @@ def tune(
# Up to now, we only support parameter `kind`, of which default value
# is `StdOut`, to specify the kind of metrics collector.
experiment.spec.metrics_collector_spec = models.V1beta1MetricsCollectorSpec(
collector=models.V1beta1CollectorSpec(kind=metrics_collector_config["kind"])
collector=models.V1beta1CollectorSpec(
kind=metrics_collector_config["kind"],
custom_collector=metrics_collector_config["custom_collector"],
)
)

# Create Trial specification.
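For illustration, here is a minimal usage sketch of the new option added above. It is not part of this PR: the container name, image, command, and experiment name are placeholders, and only `tune()` arguments that appear in this diff and in the e2e test below are used.

```python
from kubeflow.katib import KatibClient, search
from kubernetes.client import V1Container


def objective(parameters):
    # Toy objective; prints the metric in "name=value" form.
    result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
    print(f"result={result}")


# Placeholder sidecar container for the custom metrics collector.
collector = V1Container(
    name="my-metrics-collector",
    image="example.com/my-metrics-collector:latest",
    command=["python", "/app/collector.py"],
)

KatibClient().tune(
    name="tune-with-custom-collector",
    objective=objective,
    parameters={
        "a": search.int(min=10, max=20),
        "b": search.double(min=0.1, max=0.2),
    },
    objective_metric_name="result",
    max_trial_count=4,
    metrics_collector_config={"kind": "Custom", "custom_collector": collector},
)
```

If `metrics_collector_config` is omitted entirely, the default shown in the diff (`{"kind": "StdOut", "custom_collector": None}`) applies.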
9 changes: 9 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector
@@ -0,0 +1,9 @@
FROM python:3.8-slim

WORKDIR /app

COPY test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py .

RUN pip install kubernetes

CMD ["python", "dummy-collector.py"]
Member: Why do we need this container?

Author: This container is meant to serve as a dummy metrics collector container for the e2e test.

3 changes: 3 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/build-load.sh
100755 → 100644
@@ -167,6 +167,9 @@ done
if "$TUNE_API"; then
echo -e "\nPulling and building testing image for tune function..."
_build_containers "suggestion-hyperopt" "$CMD_PREFIX/suggestion/hyperopt/$VERSION/Dockerfile"

echo -e "\nBuilding dummy collector image..."
_build_containers "dummy-collector" "test/e2e/v1beta1/scripts/gh-actions/Dockerfile.dummy-collector"
fi

echo -e "\nCleanup Build Cache...\n"
27 changes: 27 additions & 0 deletions test/e2e/v1beta1/scripts/gh-actions/dummy-collector.py
@@ -0,0 +1,27 @@
import argparse
import logging
import time

from kubernetes import client, config

# The default logging config.
logging.basicConfig(level=logging.INFO)

def collect_metrics(metric_name: str):
    config.load_incluster_config()
    v1 = client.CoreV1Api()

    while True:
        dummy_metric_value = 42
        logging.info(f"Collected dummy metric: {metric_name}={dummy_metric_value}")

        time.sleep(10)  # Collect metrics every 10 seconds

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--metric-name", type=str, required=True, help="Name of the metric to collect")
    args = parser.parse_args()

    collect_metrics(args.metric_name)


89 changes: 84 additions & 5 deletions test/e2e/v1beta1/scripts/gh-actions/run-e2e-tune-api.py
@@ -3,6 +3,7 @@

from kubeflow.katib import KatibClient, search
from kubernetes import client
from kubernetes.client import V1Container
from verify import verify_experiment_results

# Experiment timeout is 40 min.
@@ -11,8 +12,68 @@
# The default logging config.
logging.basicConfig(level=logging.INFO)

def run_e2e_experiment_create_by_tune_custom_metrics_collector(

Member: I don't think we need another E2E just to test this functionality. We already run E2Es using these YAML files for various metrics collectors: https://github.com/kubeflow/katib/tree/master/examples/v1beta1/metrics-collector
I think for your feature, you just need to add unit tests for the Katib Client to verify that the Experiment has the correct specification: https://github.com/kubeflow/katib/blob/master/sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py

Author: Sure! Should I then remove the e2e test with the custom metrics collector from run-e2e-tune-api.py?

katib_client: KatibClient,
exp_name: str,
exp_namespace: str,
):
# Create Katib Experiment and wait until it is finished.
logging.debug("Creating Experiment: {}/{}".format(exp_namespace, exp_name))

# Use the test case from get-started tutorial.
# https://www.kubeflow.org/docs/components/katib/getting-started/#getting-started-with-katib-python-sdk
# [1] Create an objective function.
def objective(parameters):
import time
time.sleep(5)
result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
print(f"result={result}")

# [2] Create hyperparameter search space.
parameters = {
"a": search.int(min=10, max=20),
"b": search.double(min=0.1, max=0.2)
}

# [3] Create a dummy metric collector (DOES NOT HAVE AN IMAGE)
metric_collector = V1Container(
name="dummy-collector",
image="dummy-collector:latest",
command=["python", "/app/dummy-collector.py"],
args=["--metric-name=result"],
env=[
client.V1EnvVar(name="EXPERIMENT_NAME", value=exp_name),
client.V1EnvVar(name="EXPERIMENT_NAMESPACE", value=exp_namespace)
]
)

# [4] Create Katib Experiment with 4 Trials and 2 CPUs per Trial.
# And Wait until Experiment reaches Succeeded condition.
katib_client.tune(
name=exp_name,
namespace=exp_namespace,
objective=objective,
parameters=parameters,
objective_metric_name="result",
max_trial_count=4,
resources_per_trial={"cpu": "2"},
metrics_collector_config={
"kind": "Custom",
"custom_collector": metric_collector,
},
)
experiment = katib_client.wait_for_experiment_condition(
exp_name, exp_namespace, timeout=EXPERIMENT_TIMEOUT
)

# Verify the Experiment results.
verify_experiment_results(katib_client, experiment, exp_name, exp_namespace)

# Print the Experiment and Suggestion.
logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))

def run_e2e_experiment_create_by_tune(
def run_e2e_experiment_create_by_tune_default_metrics_collector(
katib_client: KatibClient,
exp_name: str,
exp_namespace: str,
@@ -57,7 +118,6 @@ def objective(parameters):
logging.debug(katib_client.get_experiment(exp_name, exp_namespace))
logging.debug(katib_client.get_suggestion(exp_name, exp_namespace))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -78,11 +138,12 @@ def objective(parameters):
namespace_labels['katib.kubeflow.org/metrics-collector-injection'] = 'enabled'
client.CoreV1Api().patch_namespace(args.namespace, {'metadata': {'labels': namespace_labels}})

# Test with run_e2e_experiment_create_by_tune
exp_name = "tune-example"
# Test with run_e2e_experiment_create_by_tune_default_metrics_collector
exp_namespace = args.namespace
try:
run_e2e_experiment_create_by_tune(katib_client, exp_name, exp_namespace)
exp_name = "tune-example-default-metrics-collector"
logging.info(f"Runnning E2E for Experiment created by tune: {exp_namespace}/{exp_name}")
run_e2e_experiment_create_by_tune_default_metrics_collector(katib_client, exp_name, exp_namespace)
logging.info("---------------------------------------------------------------")
logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
except Exception as e:
@@ -94,3 +155,21 @@ def objective(parameters):
logging.info("---------------------------------------------------------------")
logging.info("---------------------------------------------------------------")
katib_client.delete_experiment(exp_name, exp_namespace)


# Test with run_e2e_experiment_create_by_tune_custom_metrics_collector
try:
exp_name = "tune-example-custom-metrics-collector"
logging.info(f"Runnning E2E for Experiment created by tune: {exp_namespace}/{exp_name}")
run_e2e_experiment_create_by_tune_custom_metrics_collector(katib_client, exp_name, exp_namespace)
logging.info("---------------------------------------------------------------")
logging.info(f"E2E is succeeded for Experiment created by tune: {exp_namespace}/{exp_name}")
except Exception as e:
logging.info("---------------------------------------------------------------")
logging.info(f"E2E is failed for Experiment created by tune: {exp_namespace}/{exp_name}")
raise e
finally:
# Delete the Experiment.
logging.info("---------------------------------------------------------------")
logging.info("---------------------------------------------------------------")
katib_client.delete_experiment(exp_name, exp_namespace)

Member: I think we need to delete the former experiment before we run another experiment. Otherwise, we may run into an "xxx experiment already exists" error.
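
As suggested earlier in the review, unit tests in katib_client_test.py may be a lighter-weight way to cover this feature than an extra E2E. The sketch below is not from this PR: it only exercises the spec-mapping logic the diff adds, via a local helper rather than the real `tune()` call, and it assumes the `V1beta1CollectorSpec` model accepts the `custom_collector` field exactly as used in the diff.

```python
from kubeflow.katib import models
from kubernetes.client import V1Container


def build_collector_spec(metrics_collector_config):
    # Mirrors the mapping tune() performs in this PR; .get() is used here so the
    # helper also tolerates configs that omit the custom_collector key.
    return models.V1beta1MetricsCollectorSpec(
        collector=models.V1beta1CollectorSpec(
            kind=metrics_collector_config["kind"],
            custom_collector=metrics_collector_config.get("custom_collector"),
        )
    )


def test_custom_metrics_collector_spec():
    collector = V1Container(name="dummy-collector", image="dummy-collector:latest")
    spec = build_collector_spec({"kind": "Custom", "custom_collector": collector})
    assert spec.collector.kind == "Custom"
    assert spec.collector.custom_collector is collector


def test_default_metrics_collector_spec():
    spec = build_collector_spec({"kind": "StdOut", "custom_collector": None})
    assert spec.collector.kind == "StdOut"
    assert spec.collector.custom_collector is None
```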