Merge branch 'main' into farzaan/add-safety-to-metadata
farzaank authored Nov 8, 2024
2 parents 7727579 + 04a3068 commit b2a46d7
Showing 32 changed files with 3,011 additions and 102 deletions.
28 changes: 23 additions & 5 deletions docs/huggingface_models.md
@@ -1,18 +1,25 @@
# Hugging Face Model Hub Integration

HELM can be used to evaluate `AutoModelForCausalLM` models (e.g. [`BioMedLM`](https://huggingface.co/stanford-crfm/BioMedLM)) on [Hugging Face Model Hub](https://huggingface.co/models).
HELM can be used to evaluate `AutoModelForCausalLM` models (e.g. [`BioMedLM`](https://huggingface.co/stanford-crfm/BioMedLM)) on [Hugging Face Model Hub](https://huggingface.co/models) or local disk. Note that only `AutoModelForCausalLM` models are supported; other classes such as `AutoModelForSeq2SeqLM` may be supported in the future.

## Using `model_deployments.yaml`

You can add Hugging Face models using the method described in [Adding New Models](adding_new_models.md). This works for models both on Hugging Face Hub and on local disk. Please refer to that page for instructions on how to do so.

## Using command-line flags

In some cases, you can use command-line flags with `helm-run` to evaluate Hugging Face models. This is a more convenient way to use Hugging Face models because it does not require configuration files.

To use `AutoModelForCausalLM` models from Hugging Face Model Hub, pass the Hugging Face model IDs to the `--enable-huggingface-models` flag of `helm-run`. This makes the corresponding Hugging Face models available to use in your run spec descriptions. In each run spec description, use the Hugging Face model ID as the model name.

To use a revision of a model other than the default main revision, append a `@` followed by the revision name to the model ID passed to the `--enable-huggingface-models` flag.
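For instance, a minimal sketch of the revision syntax (this invocation is illustrative; it assumes the model has a revision named `main` and that the run entry uses the bare model ID while the flag carries the revision):

```bash
# Run boolq on stanford-crfm/BioMedLM pinned to an explicit revision
helm-run \
--run-entries boolq:model=stanford-crfm/BioMedLM \
--enable-huggingface-models stanford-crfm/BioMedLM@main \
--suite v1 \
--max-eval-instances 10
```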

Current restrictions:
Current restrictions with command-line flags:

- Only `AutoModelForCausalLM` is supported; other classes such as `AutoModelForSeq2SeqLM` may be supported in the future.
- Models without a namespace are not supported (e.g. `bert-base-uncased`).
- Models at local file paths are not supported.
- The model must have `model_max_length` set in the tokenizer configuration.

Examples:
Example model on Hugging Face Hub:

```bash
# Run boolq on stanford-crfm/BioMedLM at the default main revision
@@ -30,6 +37,17 @@ helm-run \
--max-eval-instances 10
```

Example model on local disk:

```bash
# Run boolq on a model stored on local disk
helm-run \
--run-entries boolq:model=your-org/your-model \
--enable-local-huggingface-models path/to/your-org/your-model \
--suite v1 \
--max-eval-instances 10
```

To use Optimum Intel, add the `--openvino` flag to `helm-run`. Optimum Intel provides a simple interface to optimize Transformer models and convert them to OpenVINO™ Intermediate Representation format to accelerate end-to-end pipelines on Intel® architectures using OpenVINO™ runtime. It runs the model on the CPU.

Examples:
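A minimal sketch, assuming `--openvino` is simply appended to the invocation used above (the model and other flags are illustrative):

```bash
# Run boolq on stanford-crfm/BioMedLM with the model converted to OpenVINO IR and run on CPU
helm-run \
--run-entries boolq:model=stanford-crfm/BioMedLM \
--enable-huggingface-models stanford-crfm/BioMedLM \
--suite v1 \
--max-eval-instances 10 \
--openvino
```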
30 changes: 4 additions & 26 deletions helm-frontend/project_metadata.json
@@ -3,18 +3,7 @@
"title": "Lite",
"description": "Lightweight, broad evaluation of the capabilities of language models using in-context learning",
"id": "lite",
"releases": [
"v1.9.0",
"v1.8.0",
"v1.7.0",
"v1.6.0",
"v1.5.0",
"v1.4.0",
"v1.3.0",
"v1.2.0",
"v1.1.0",
"v1.0.0"
]
"releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "Classic",
@@ -38,18 +27,7 @@
"title": "MMLU",
"description": "Massive Multitask Language Understanding (MMLU) evaluations using standardized prompts",
"id": "mmlu",
"releases": [
"v1.9.0",
"v1.8.0",
"v1.7.0",
"v1.6.0",
"v1.5.0",
"v1.4.0",
"v1.3.0",
"v1.2.0",
"v1.1.0",
"v1.0.0"
]
"releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "VHELM",
@@ -61,13 +39,13 @@
"title": "Image2Struct",
"description": "Evaluations of Vision-Language Models on extracting structured information from images",
"id": "image2struct",
"releases": ["v1.0.1", "v1.0.0"]
"releases": ["v1.0.2", "v1.0.1", "v1.0.0"]
},
{
"title": "AIR-Bench",
"description": "Safety benchmark based on emerging government regulations and company policies",
"id": "air-bench",
"releases": ["v1.1.0", "v1.0.0"]
"releases": ["v1.2.0", "v1.1.0", "v1.0.0"]
},
{
"title": "Safety",
6 changes: 5 additions & 1 deletion setup.cfg
@@ -54,6 +54,8 @@ install_requires=
scipy~=1.10
uncertainty-calibration~=0.1.4
scikit-learn~=1.1
jiwer~=3.0
rapidfuzz~=3.10

# Models and Metrics Extras
transformers~=4.40 # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
@@ -146,7 +148,7 @@ mistral =
mistralai~=1.1

openai =
openai~=1.0
openai~=1.52
tiktoken~=0.7
pydantic~=2.0 # For model_dump(mode="json") - openai only requires pydantic>=1.9.0

@@ -289,6 +291,8 @@ audiolm =

# For metrics
pycocoevalcap~=1.2
jiwer~=3.0
rapidfuzz~=3.10

# Install everything
all =
7 changes: 7 additions & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -8,6 +8,7 @@
ADAPT_GENERATION: str = "generation"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_RANKING_BINARY: str = "ranking_binary"
@@ -63,6 +64,12 @@ class AdapterSpec:
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

chain_of_thought_prefix: str = ""
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""

chain_of_thought_suffix: str = "\n"
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""

output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

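For illustration, a minimal sketch (not part of this commit) of an `AdapterSpec` that sets the new chain-of-thought fields; the prompt strings mirror the examples in the field docstrings above and are assumptions:

```python
from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    AdapterSpec,
)

# Hypothetical spec: the chain of thought is framed by a cue to reason step by
# step and a cue that the final letter answer follows.
spec = AdapterSpec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    chain_of_thought_prefix="Let's think step by step: ",
    chain_of_thought_suffix=" The correct answer is ",
    output_prefix="Answer: ",
    max_tokens=512,  # leave room for the reasoning before the letter answer
)
```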
6 changes: 6 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -3,6 +3,7 @@
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
@@ -19,6 +20,9 @@
)
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
MultipleChoiceJointChainOfThoughtAdapter,
)
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService

@@ -38,6 +42,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):

@staticmethod
def get_prefix_char(prefix: str) -> str:
return prefix.lstrip()[0]
return [char for char in prefix if char.isalnum()][0]

@staticmethod
def get_reference_prefix(prefix: str, i: int) -> str:
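A small illustration (not from the commit) of what the `get_prefix_char` change does: the old expression returned the first non-whitespace character of the reference prefix, while the new one returns the first alphanumeric character, so a parenthesized prefix now yields the letter itself:

```python
prefix = "(A) "  # hypothetical reference prefix
prefix.lstrip()[0]                              # old behavior: '('
[char for char in prefix if char.isalnum()][0]  # new behavior: 'A'
```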
src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py
@@ -0,0 +1,87 @@
from typing import Optional

from helm.benchmark.scenarios.scenario import Instance
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter


class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
"""
Each `Instance` in a `Scenario` looks like this:
<input> -> <reference1>
<reference2>
<reference3> [correct]
<reference4>
<instance_chain_of_thought>
We can define a label (e.g., letter) for each reference:
<global_prefix>
<instructions>
<input_prefix>
<input> # train
<input_suffix>
A. <reference1>
B. <reference2>
C. <reference3>
D. <reference4>
<output_prefix>
<chain_of_thought_prefix>
<instance_chain_of_thought>
<chain_of_thought_suffix>
<output>
<output_suffix>
<input_prefix>
<input> # test
<input_suffix>
A. <reference1>
B. <reference2>
C. <reference3>
D. <reference4>
<output_prefix>
<chain_of_thought_prefix>
<instance_chain_of_thought>
<chain_of_thought_suffix>
<output>
<output_suffix>
<global_suffix>
In general, each example is:
<input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
<output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
"""

def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
"""Return a list of lines corresponding to this example (part of the prompt)."""
# Input
result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix

# Include the references
delimiter = ", "
no_correct_references = "n/a"
output = no_correct_references
for reference_index, reference in enumerate(instance.references):
prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
result += prefix + reference.output.text + self.adapter_spec.reference_suffix
if reference.is_correct:
if output == no_correct_references:
output = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
elif self.adapter_spec.multi_label:
output += delimiter
output += self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)

if include_output:
chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
chain_of_thought_block = (
self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
)
result += (
self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
)
else:
result += self.adapter_spec.output_prefix.rstrip()

return result
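To make the template above concrete, here is a hypothetical rendering of a single in-context example under `Question:`/`Answer:` prefixes and the chain-of-thought cues suggested in the `AdapterSpec` docstrings; all values are invented for illustration:

```python
# One assembled training example, roughly what construct_example_prompt
# would return for an instance whose extra_data carries a chain of thought.
example = (
    "Question: Which gas do plants absorb during photosynthesis?\n"
    "A. Oxygen\n"
    "B. Carbon dioxide\n"
    "C. Nitrogen\n"
    "D. Helium\n"
    "Answer: Let's think step by step: Photosynthesis fixes carbon from the air, "
    "so the absorbed gas is carbon dioxide. The correct answer is B\n"
)
```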
73 changes: 69 additions & 4 deletions src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -4,6 +4,7 @@
ADAPT_GENERATION,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_RANKING_BINARY,
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=format_instructions(instructions),
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n" if input_noun is not None else "",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
temperature=0.0,
stop_sequences=["\n"],
sample_train=sample_train,
**kwargs,
)


def get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions: str,
input_noun: Optional[str],
output_noun: str,
num_outputs: int = 5,
max_train_instances: int = 5,
max_tokens: int = 5,
sample_train: bool = True,
**kwargs,
) -> AdapterSpec:
"""
[instructions]
[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]: [output]
[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
instructions=format_instructions(instructions),
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
sample_train=sample_train,
**kwargs,
)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions,
input_noun,
output_noun,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
sample_train=sample_train,
**kwargs,
)
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
return get_multiple_choice_separate_adapter_spec(method, empty_input)
else:
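For context, a hypothetical call (not part of this commit) that reaches the new branch through the shared dispatcher; the scenario-specific strings are placeholders:

```python
from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec

# Build a chain-of-thought multiple-choice adapter spec for some scenario.
adapter_spec = get_multiple_choice_adapter_spec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    instructions="Answer the following multiple-choice questions.",
    input_noun="Question",
    output_noun="Answer",
    max_tokens=1000,  # leave room for the chain of thought before the answer
)
```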