From 30f1f6aec2527cad0d8ed3fff5ed211608e8244a Mon Sep 17 00:00:00 2001
From: Googler
Date: Fri, 5 Jan 2024 16:55:32 -0800
Subject: [PATCH] feat(components): Implement train-time evaluation in reward
 model training.

When a train-time eval dataset is available, the pipeline outputs the
accuracy and cross-entropy metrics to the log.

PiperOrigin-RevId: 596114293
---
 components/google-cloud/RELEASE.md            |  1 +
 .../_implementation/llm/function_based.py     | 49 +++++++++++--------
 .../llm/generated/refined_image_versions.py   |  2 +-
 .../_implementation/llm/reward_model_graph.py | 23 +++++++++
 .../llm/reward_model_trainer.py               |  4 ++
 .../preview/llm/rlhf/component.py             |  8 ++-
 6 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/components/google-cloud/RELEASE.md b/components/google-cloud/RELEASE.md
index 8bedf1aeeb39..6db540d0fded 100644
--- a/components/google-cloud/RELEASE.md
+++ b/components/google-cloud/RELEASE.md
@@ -2,6 +2,7 @@
 * Add `v1.automl.forecasting.learn_to_learn_forecasting_pipeline`, `v1.automl.forecasting.sequence_to_sequence_forecasting_pipeline`, `v1.automl.forecasting.temporal_fusion_transformer_forecasting_pipeline`, `v1.automl.forecasting.time_series_dense_encoder_forecasting_pipeline` as Forecasting on Pipelines moves to GA.
 * Fix bug in `preview.llm.rlhf_pipeline` that caused wrong output artifact to be used for inference after training.
 * Fix issue where AutoSxS was not propagating location to all sub-components.
+* Use `eval_dataset` for train-time evaluation when training a reward model. Requires `eval_dataset` to contain the same fields as the [preference dataset](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-rlhf#human-preference-dataset).
 
 ## Release 2.10.0
 * Fix the missing output of pipeline remote runner. `AutoMLImageTrainingJobRunOp` now passes the model artifacts correctly to downstream components.
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/function_based.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/function_based.py
index 446c478f0c75..a7f5c7bd4fce 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/function_based.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/function_based.py
@@ -573,25 +573,32 @@ def get_empty_string() -> str:
 def validate_rlhf_inputs(
     large_model_reference: str,
     eval_dataset: Optional[str] = None,
-) -> None:
+) -> str:
   """Checks user-provided arguments are valid for the RLHF pipeline."""
-  models_that_support_bulk_inference = {
-      't5-small',
-      't5-large',
-      't5-xl',
-      't5-xxl',
-      'llama-2-7b',
-      'llama-2-7b-chat',
-      'llama-2-13b',
-      'llama-2-13b-chat',
-  }
-  if (
-      eval_dataset
-      and large_model_reference not in models_that_support_bulk_inference
-  ):
-    raise ValueError(
-        f'eval_dataset not supported for {large_model_reference}. '
-        'Please set this value to None when tuning this model. '
-        'This model can be evaluated after tuning using Batch or Online '
-        'Prediction.'
-    )
+  import json
+  import re
+  import glob
+
+  eval_dataset = eval_dataset or ''
+  gcs_eval_dataset_uri = re.sub('^gs://', '/gcs/', eval_dataset)
+  files_in_the_folder = glob.glob(gcs_eval_dataset_uri)
+  if not files_in_the_folder:
+    return ''
+  one_file = files_in_the_folder[0]
+  required_fields = ('input_text', 'candidate_0', 'candidate_1', 'choice')
+  is_valid_preference_data = True
+  remaining_lines_to_check = 100
+  empty_eval_dataset_for_reward_model = ''
+  with open(one_file, 'r') as inputs:
+    for line in inputs:
+      json_data = json.loads(line)
+      remaining_lines_to_check -= 1
+      is_valid_preference_data = is_valid_preference_data & all(
+          field in json_data for field in required_fields
+      )
+      if not is_valid_preference_data:
+        return empty_eval_dataset_for_reward_model
+      if remaining_lines_to_check == 0:
+        break
+
+  return eval_dataset
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
index 01c853c87b4a..2ce814ba193b 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py
@@ -17,4 +17,4 @@
 DO NOT EDIT - This file is generated, manual changes will be overridden.
 """
 
-IMAGE_TAG = '20240303_0507_RC00'
+IMAGE_TAG = '20240305_0507_RC00'
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py
index edbd4ccae649..52e822616721 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py
@@ -45,6 +45,7 @@ def pipeline(
     lora_dim: int = 4,
     reward_model_learning_rate_multiplier: float = 1.0,
     reward_model_train_steps: int = 1000,
+    eval_dataset: Optional[str] = None,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
     location: str = _placeholders.LOCATION_PLACEHOLDER,
@@ -119,6 +120,25 @@ def pipeline(
       .set_caching_options(False)
   )
 
+  preference_eval_dataset_importer = (
+      private_text_comparison_importer.private_text_comparison_importer(
+          project=project,
+          location=location,
+          input_text=eval_dataset,
+          inputs_field_name=prompt_column,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          choice_field_name=choice_column,
+          split=env.TRAIN_SPLIT,
+          large_model_reference=reference_model_metadata.outputs[
+              'reward_model_reference'
+          ],
+          instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
+      )
+      .set_display_name('Import Preference Eval Dataset')
+      .set_caching_options(False)
+  )
+
   reward_model_image_uri = function_based.resolve_private_refined_image_uri(
       accelerator_type=machine_spec.outputs['accelerator_type'],
   ).set_display_name('Resolve Reward Model Image URI')
@@ -137,6 +157,9 @@ def pipeline(
           input_dataset_path=preference_dataset_importer.outputs[
               'output_dataset_path'
           ],
+          eval_dataset_path=preference_eval_dataset_importer.outputs[
+              'output_dataset_path'
+          ],
           train_steps=reward_model_train_steps,
           accelerator_type=machine_spec.outputs['accelerator_type'],
           accelerator_count=machine_spec.outputs['accelerator_count'],
diff --git a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_trainer.py b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_trainer.py
index d26bb2c486dc..69a3f912edbc 100644
--- a/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_trainer.py
+++ b/components/google-cloud/google_cloud_pipeline_components/_implementation/llm/reward_model_trainer.py
@@ -35,6 +35,7 @@ def reward_model_trainer(
     output_adapter_path: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     tensorboard_metrics: kfp.dsl.Output[kfp.dsl.Artifact],  # pytype: disable=unsupported-operands
     gcp_resources: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    eval_dataset_path: str = '',
     train_split: str = 'train',
    batch_size: int = 64,
     learning_rate_multiplier: float = 1.0,
@@ -49,6 +50,8 @@ def reward_model_trainer(
     location: Location used to run the job.
     input_model_path: Path to the base model to fine tune.
     input_dataset_path: Path to dataset to use to train a reward model.
+    eval_dataset_path: Path to the eval dataset to use during reward model
+      training.
     train_steps: Number of training steps. These are the number of steps on
       top of any steps used to train the base model.
     accelerator_type: Type of TPU accelerator. Can be either TPU_V2 or TPU_V3.
@@ -94,6 +97,7 @@ def reward_model_trainer(
           f'--train_steps={train_steps}',
           f'--input_model_path={input_model_path}',
           f'--input_dataset_path={input_dataset_path}',
+          f'--eval_dataset_path={eval_dataset_path}',
           f'--output_adapter_path={output_adapter_path}',
           f'--tensorboard_metrics_path={tensorboard_metrics.path}',
           f'--large_model_reference={large_model_reference}',
diff --git a/components/google-cloud/google_cloud_pipeline_components/preview/llm/rlhf/component.py b/components/google-cloud/google_cloud_pipeline_components/preview/llm/rlhf/component.py
index a62ea3c3595a..870dd131c1e1 100644
--- a/components/google-cloud/google_cloud_pipeline_components/preview/llm/rlhf/component.py
+++ b/components/google-cloud/google_cloud_pipeline_components/preview/llm/rlhf/component.py
@@ -71,7 +71,7 @@ def rlhf_pipeline(
     kl_coeff: Coefficient for KL penalty. This regularizes the policy model and penalizes if it diverges from its initial distribution. If set to 0, the reference language model is not loaded into memory. Default value is 0.1.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     deploy_model: Whether to deploy the model to an endpoint in `us-central1`. Default is True.
-    eval_dataset: Optional Cloud storage path to an evaluation dataset. Note, eval dataset can only be provided for third-party models. If provided, inference will be performed on this dataset after training. The dataset format is jsonl. Each example in the dataset must contain a field `input_text` that contains the prompt.
+    eval_dataset: Optional Cloud storage path to an evaluation dataset. The dataset format is jsonl. The evaluation dataset can be used to compute train-time metrics (when training a reward model) or perform bulk inference for third-party models. To compute train-time metrics, this dataset must contain the same fields as the preference dataset. For bulk inference with third-party models, only `input_text` is needed. Note, train-time metrics are only computed for the first 5000 samples in the dataset for efficient evaluation during training.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
     location: Location used to run custom jobs. If not specified the location used to run the pipeline will be used.
     encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
@@ -82,6 +82,10 @@ def rlhf_pipeline(
     endpoint_resource_name: Path the Online Prediction Endpoint. This will be an empty string if the model was not deployed.
   """
   # fmt: on
+  reward_model_eval_dataset = function_based.validate_rlhf_inputs(
+      large_model_reference=large_model_reference,
+      eval_dataset=eval_dataset,
+  ).set_display_name('Validate Inputs')
 
   # LoRA dim for reward model
   reward_lora_dim = 4
@@ -105,6 +109,7 @@ def rlhf_pipeline(
           large_model_reference=large_model_reference,
           prompt_sequence_length=prompt_sequence_length,
           target_sequence_length=target_sequence_length,
+          eval_dataset=reward_model_eval_dataset.output,
           instruction=instruction,
           reward_model_learning_rate_multiplier=reward_model_learning_rate_multiplier,
           reward_model_train_steps=reward_model_train_steps,
@@ -118,7 +123,6 @@ def rlhf_pipeline(
       .set_display_name('Train Reward Model')
       .after(validate_pipeline_task)
   )
-
   rl_model_pipeline = reinforcement_learning_graph.pipeline(
       prompt_dataset=prompt_dataset,
       input_reward_model_path=reward_model_pipeline.outputs[
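
Reviewer note: the sketch below is a minimal, hypothetical illustration (not part of the patch) of the jsonl record shape that the new `validate_rlhf_inputs` check accepts for the reward-model eval path. The bucket path, prompt, and candidate strings are made up; only the four required field names, the one-JSON-object-per-line layout, and the `gs://` to `/gcs/` rewrite are taken from the code above. The validator only inspects the first 100 lines of one matched file, and per the `eval_dataset` docstring only the first 5000 samples are used for train-time metrics.

    import json
    import re
    import tempfile

    # Hypothetical preference-format record; field values are illustrative only.
    example = {
        'input_text': 'Summarize: The quick brown fox jumps over the lazy dog.',
        'candidate_0': 'A fox jumps over a dog.',
        'candidate_1': 'The dog jumps over the fox.',
        'choice': 0,
    }

    # Each eval example is one JSON object per line (jsonl).
    with tempfile.NamedTemporaryFile('w', suffix='.jsonl', delete=False) as f:
      f.write(json.dumps(example) + '\n')
      local_path = f.name

    # Mirror the validator's two checks: gs:// URIs map onto the /gcs/ fuse
    # mount, and every record must contain the four preference-dataset fields.
    print(re.sub('^gs://', '/gcs/', 'gs://my-bucket/eval/*.jsonl'))
    required_fields = ('input_text', 'candidate_0', 'candidate_1', 'choice')
    with open(local_path, 'r') as inputs:
      for line in inputs:
        record = json.loads(line)
        assert all(field in record for field in required_fields)

If any checked record is missing a required field, `validate_rlhf_inputs` returns an empty string, so the reward model is trained without train-time metrics rather than failing the pipeline.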