From 1cee08834d53c91696373ed8f8d367139d934167 Mon Sep 17 00:00:00 2001
From: Googler
Date: Fri, 7 Jun 2024 15:33:19 -0700
Subject: [PATCH] feat(components): Add role_field_name and model_name as input
 parameters to llm_evaluation_preprocessor component to support gemini model's
 input and output schema

Signed-off-by: Googler
PiperOrigin-RevId: 641377116
---
 .../src/v2/compiler/argocompiler/container.go  | 55 +++++++++++++++++++
 .../llm_evaluation_preprocessor/component.py   | 14 +++++
 ...evaluation_llm_text_generation_pipeline.py  |  4 ++
 3 files changed, 73 insertions(+)

diff --git a/backend/src/v2/compiler/argocompiler/container.go b/backend/src/v2/compiler/argocompiler/container.go
index 0dbb38a1117a..d412c08b59b0 100644
--- a/backend/src/v2/compiler/argocompiler/container.go
+++ b/backend/src/v2/compiler/argocompiler/container.go
@@ -15,7 +15,9 @@
 package argocompiler
 
 import (
+	"fmt"
 	"os"
+	"strings"
 
 	wfapi "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
 	"github.com/golang/protobuf/jsonpb"
@@ -361,6 +363,59 @@ func (c *workflowCompiler) addContainerExecutorTemplate(refName string) string {
 			extendPodMetadata(&executor.Metadata, k8sExecCfg)
 		}
 	}
+	caBundleCfgMapName := os.Getenv("EXECUTOR_CABUNDLE_CONFIGMAP_NAME")
+	caBundleCfgMapKey := os.Getenv("EXECUTOR_CABUNDLE_CONFIGMAP_KEY")
+	caBundleMountPath := os.Getenv("EXECUTOR_CABUNDLE_MOUNTPATH")
+	if caBundleCfgMapName != "" && caBundleCfgMapKey != "" {
+		caFile := fmt.Sprintf("%s/%s", caBundleMountPath, caBundleCfgMapKey)
+		var certDirectories = []string{
+			caBundleMountPath,
+			"/etc/ssl/certs",
+			"/etc/pki/tls/certs",
+		}
+		// Add to REQUESTS_CA_BUNDLE for python request library.
+		executor.Container.Env = append(executor.Container.Env, k8score.EnvVar{
+			Name:  "REQUESTS_CA_BUNDLE",
+			Value: caFile,
+		})
+		// For AWS utilities like cli, and packages.
+		executor.Container.Env = append(executor.Container.Env, k8score.EnvVar{
+			Name:  "AWS_CA_BUNDLE",
+			Value: caFile,
+		})
+		// OpenSSL default cert file env variable.
+		// https://www.openssl.org/docs/man1.1.1/man3/SSL_CTX_set_default_verify_paths.html
+		executor.Container.Env = append(executor.Container.Env, k8score.EnvVar{
+			Name:  "SSL_CERT_FILE",
+			Value: caFile,
+		})
+		sslCertDir := strings.Join(certDirectories, ":")
+		executor.Container.Env = append(executor.Container.Env, k8score.EnvVar{
+			Name:  "SSL_CERT_DIR",
+			Value: sslCertDir,
+		})
+		volume := k8score.Volume{
+			Name: volumeNameCABUndle,
+			VolumeSource: k8score.VolumeSource{
+				ConfigMap: &k8score.ConfigMapVolumeSource{
+					LocalObjectReference: k8score.LocalObjectReference{
+						Name: caBundleCfgMapName,
+					},
+				},
+			},
+		}
+
+		executor.Volumes = append(executor.Volumes, volume)
+
+		volumeMount := k8score.VolumeMount{
+			Name:      volumeNameCABUndle,
+			MountPath: caFile,
+			SubPath:   caBundleCfgMapKey,
+		}
+
+		executor.Container.VolumeMounts = append(executor.Container.VolumeMounts, volumeMount)
+
+	}
 	c.templates[nameContainerImpl] = executor
 	c.wf.Spec.Templates = append(c.wf.Spec.Templates, *container, *executor)
 	return nameContainerExecutor
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py
index 4576a1875b12..5c2b6f2e2da7 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py
@@ -38,6 +38,8 @@ def evaluation_dataset_preprocessor_internal(
     output_dirs: dsl.OutputPath(list),
     gcp_resources: dsl.OutputPath(str),
     input_field_name: str = 'input_text',
+    role_field_name: str = 'role',
+    model_name: str = 'publishers/google/model/text-bison@002',
     display_name: str = 'llm_evaluation_dataset_preprocessor_component',
     machine_type: str = 'e2-highmem-16',
     service_account: str = '',
@@ -56,6 +58,9 @@
     gcs_source_uris: A json escaped list of GCS URIs of the input eval dataset.
     input_field_name: The field name of the input eval dataset instances that
       contains the input prompts to the LLM.
+    role_field_name: The field name of the role for input eval dataset instances
+      that contains the input prompts to the LLM.
+    model_name: Name of the model being used to create model-specific schemas.
     machine_type: The machine type of this custom job. If not set, defaulted
       to `e2-highmem-16`. More details:
       https://cloud.google.com/compute/docs/machine-resource
@@ -92,6 +97,8 @@
           f'--eval_dataset_preprocessor={True}',
           f'--gcs_source_uris={gcs_source_uris}',
           f'--input_field_name={input_field_name}',
+          f'--role_field_name={role_field_name}',
+          f'--model_name={model_name}',
           f'--output_dirs={output_dirs}',
           '--executor_input={{$.json_escape[1]}}',
       ],
@@ -109,6 +116,8 @@ def llm_evaluation_dataset_preprocessor_graph_component(
     location: str,
     gcs_source_uris: List[str],
     input_field_name: str = 'input_text',
+    role_field_name: str = 'role',
+    model_name: str = 'publishers/google/model/text-bison@002',
     display_name: str = 'llm_evaluation_dataset_preprocessor_component',
     machine_type: str = 'e2-standard-4',
     service_account: str = '',
@@ -126,6 +135,9 @@
     gcs_source_uris: A list of GCS URIs of the input eval dataset.
     input_field_name: The field name of the input eval dataset instances
      that contains the input prompts to the LLM.
+    role_field_name: The field name of the role for input eval dataset
+      instances that contains the input prompts to the LLM.
+    model_name: Name of the model being used to create model-specific schemas.
     display_name: The name of the Evaluation job.
     machine_type: The machine type of this custom job. If not set, defaulted
       to `e2-standard-4`. More details:
@@ -163,6 +175,8 @@ def llm_evaluation_dataset_preprocessor_graph_component(
           input_list=gcs_source_uris
       ).output,
       input_field_name=input_field_name,
+      role_field_name=role_field_name,
+      model_name=model_name,
       display_name=display_name,
       machine_type=machine_type,
       service_account=service_account,
diff --git a/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py b/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py
index a678d6cfbdc2..534e3afde0a1 100644
--- a/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py
+++ b/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py
@@ -38,6 +38,7 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
     batch_predict_gcs_destination_output_uri: str,
     model_name: str = 'publishers/google/models/text-bison@002',
     evaluation_task: str = 'text-generation',
+    role_field_name: str = 'role',
     input_field_name: str = 'input_text',
     target_field_name: str = 'output_text',
     batch_predict_instances_format: str = 'jsonl',
@@ -76,6 +77,7 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
     batch_predict_gcs_destination_output_uri: Required. The Google Cloud Storage location of the directory where the eval pipeline output is to be written to.
     model_name: The Model name used to run evaluation. Must be a publisher Model or a managed Model sharing the same ancestor location. Starting this job has no impact on any existing deployments of the Model and their resources.
     evaluation_task: The task that the large language model will be evaluated on. The evaluation component computes a set of metrics relevant to that specific task. Currently supported tasks are: `summarization`, `question-answering`, `text-generation`.
+    role_field_name: The field name of the role for input eval dataset instances that contains the input prompts to the LLM.
     input_field_name: The field name of the input eval dataset instances that contains the input prompts to the LLM.
     target_field_name: The field name of the eval dataset instance that contains an example reference text response. Alternatively referred to as the ground truth (or ground_truth_column) field. If not set, defaulted to `output_text`.
     batch_predict_instances_format: The format in which instances are given, must be one of the Model's supportedInputStorageFormats. Only "jsonl" is currently supported. For more details about this input config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.
@@ -124,6 +126,8 @@ def evaluation_llm_text_generation_pipeline(  # pylint: disable=dangerous-defaul
       location=location,
       gcs_source_uris=batch_predict_gcs_source_uris,
       input_field_name=input_field_name,
+      role_field_name=role_field_name,
+      model_name=model_name,
       machine_type=machine_type,
       service_account=service_account,
       network=network,
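
Usage sketch (not part of the patch): the snippet below shows how the new role_field_name input might be supplied when compiling and parameterizing evaluation_llm_text_generation_pipeline, which forwards it, together with model_name, to the llm_evaluation_preprocessor component. Only the parameter names and module path come from the diff above; the project, bucket, and Gemini model resource name are placeholder assumptions.

# Sketch only: compile the pipeline touched by this patch and prepare parameter
# values that exercise the new role_field_name / model_name inputs.
# Project, bucket, and model values are placeholders, not taken from the patch.
from kfp import compiler
from google_cloud_pipeline_components.v1.model_evaluation.evaluation_llm_text_generation_pipeline import (
    evaluation_llm_text_generation_pipeline,
)

# Compile the pipeline definition to a local package file.
compiler.Compiler().compile(
    pipeline_func=evaluation_llm_text_generation_pipeline,
    package_path='evaluation_llm_text_generation_pipeline.json',
)

# Parameter values for a run; the pipeline passes role_field_name and
# model_name down to the dataset preprocessor component.
parameter_values = {
    'project': 'my-project',                                    # placeholder
    'location': 'us-central1',
    'batch_predict_gcs_source_uris': ['gs://my-bucket/eval/dataset.jsonl'],    # placeholder
    'batch_predict_gcs_destination_output_uri': 'gs://my-bucket/eval/output',  # placeholder
    'model_name': 'publishers/google/models/gemini-1.0-pro',    # example Gemini resource name
    'evaluation_task': 'text-generation',
    'role_field_name': 'role',
    'input_field_name': 'input_text',
    'target_field_name': 'output_text',
}
# The compiled package plus parameter_values can then be submitted with
# google.cloud.aiplatform.PipelineJob as usual.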