-
Notifications
You must be signed in to change notification settings - Fork 249
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
432 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 22 additions & 5 deletions
27
src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,26 @@ | ||
entries: [
  {description: "bingo:subject=Region,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=OCR,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=Factual,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=T2I,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=I2I,model=vlm,num_respondents=1", priority: 1}

  {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
  {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
  {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}

  {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]}

  {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]}
  {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]}

  {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]}
  {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]}
  {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]}
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
140 changes: 140 additions & 0 deletions
140
src/helm/benchmark/scenarios/vision_language/blink_scenario.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
from typing import List | ||
import os | ||
|
||
from datasets import load_dataset | ||
from tqdm import tqdm | ||
|
||
from helm.benchmark.scenarios.scenario import ( | ||
CORRECT_TAG, | ||
VALID_SPLIT, | ||
Instance, | ||
Input, | ||
Output, | ||
Reference, | ||
Scenario, | ||
) | ||
from helm.common.media_object import MediaObject, MultimediaObject | ||
from helm.common.images_utils import generate_hash | ||
|
||
|
||
class BlinkScenario(Scenario):
    """
    BLINK: a benchmark of 14 visual perception tasks that humans can solve "within a blink",
    yet which remain challenging for vision-language models.

    Website: https://zeyofu.github.io/blink/

    @article{fu2024blink,
        title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
        author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
        Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
        journal={arXiv preprint arXiv:2404.12390},
        year={2024}
    }
    """

    HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"

    VALID_CATEGORIES: List[str] = [
        "Art_Style",
        "Counting",
        "Forensic_Detection",
        "Functional_Correspondence",
        "IQ_Test",
        "Jigsaw",
        "Multi-view_Reasoning",
        "Object_Localization",
        "Relative_Depth",
        "Relative_Reflectance",
        "Semantic_Correspondence",
        "Spatial_Relation",
        "Visual_Correspondence",
        "Visual_Similarity",
    ]

    name = "blink"
    description = (
        "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
        "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        """Validate and store the BLINK task category (one of VALID_CATEGORIES)."""
        super().__init__()

        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one multimodal Instance per validation example of the selected category."""
        # Ordinal header text shown before each image in multi-image examples.
        ordinal_headers = {1: "First image:", 2: "Second image:", 3: "Third image:", 4: "Fourth image:"}

        def _persist_image(image) -> str:
            # Write the image to disk keyed by its content hash; reuse the file if it already exists.
            destination: str = os.path.join(output_path, generate_hash(image) + ".jpg")
            if not os.path.exists(destination):
                image.save(destination)
            return destination

        def _header_for(index: int) -> str:
            header = ordinal_headers.get(index)
            if header is None:
                raise ValueError(f"Invalid image index: {index}")
            return header

        instances: List[Instance] = []
        for row in tqdm(
            load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
        ):
            content: List[MediaObject] = []

            if row["image_2"] is not None:
                # Multi-image example: an example carries up to 4 images; stop at the first empty slot.
                for index in range(1, 5):
                    image = row[f"image_{index}"]
                    if image is None:
                        break

                    # Precede each image with an ordinal header, because some prompts refer to
                    # specific image numbers within the question, e.g.,
                    # "Given three similar but different images, take the first image as reference.
                    # Can you tell which one of the latter two images is most similar to the first one?
                    # Select from the following choices. (A) the second image (B) the third image"
                    content.append(MediaObject(text=_header_for(index), content_type="text/plain"))
                    content.append(MediaObject(location=_persist_image(image), content_type="image/jpeg"))
            else:
                # Single-image example: no header needed.
                content.append(MediaObject(location=_persist_image(row["image_1"]), content_type="image/jpeg"))

            # The prompt contains both the question and the answer choices.
            # Rewrite "(A)" ... "(D)" as "\nA." ... "\nD." since only the letter answer is expected.
            prompt: str = row["prompt"]
            for letter in "ABCD":
                prompt = prompt.replace(f"({letter})", f"\n{letter}.")
            content.append(MediaObject(text=prompt, content_type="text/plain"))

            # The gold answer arrives as a parenthesized letter, e.g. "(B)"; keep just the letter.
            answer: str = row["answer"]
            assert answer[0] == "(" and answer[-1] == ")", f"Unexpected answer format: {answer}"
            references: List[Reference] = [
                Reference(output=Output(text=answer[1]), tags=[CORRECT_TAG]),
            ]

            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
92 changes: 92 additions & 0 deletions
92
src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
from typing import List | ||
import os | ||
|
||
from datasets import load_dataset | ||
from tqdm import tqdm | ||
|
||
from helm.benchmark.scenarios.scenario import ( | ||
CORRECT_TAG, | ||
VALID_SPLIT, | ||
Instance, | ||
Input, | ||
Output, | ||
Reference, | ||
Scenario, | ||
) | ||
from helm.common.media_object import MediaObject, MultimediaObject | ||
from helm.common.images_utils import generate_hash | ||
|
||
|
||
class MMStarScenario(Scenario):
    """
    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
    are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
    involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
    multi-modal capabilities for the solution.

    Website: https://mmstar-benchmark.github.io/

    @article{chen2024we,
        title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
        author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
        Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
        journal={arXiv preprint arXiv:2403.20330},
        year={2024}
    }
    """

    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"

    # Category names as they appear in the dataset's "category" column (space-separated).
    VALID_CATEGORIES: List[str] = [
        "coarse perception",
        "fine-grained perception",
        "instance reasoning",
        "logical reasoning",
        "math",
        "science & technology",
    ]

    name = "mm_star"
    description = (
        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
        # Fix: a space is required before the citation — the adjacent literals previously
        # concatenated to "...selected by humans.([Chen, 2024]...".
        "meticulously selected by humans. "
        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        """
        Validate and store the MM-STAR category.

        :param category: Category name with underscores in place of spaces (as passed from run entries,
            e.g. "coarse_perception"); normalized here to the dataset's space-separated form.
        :raises ValueError: If the normalized category is not one of VALID_CATEGORIES.
        """
        super().__init__()

        # Run-entry specs use underscores; the dataset's "category" column uses spaces.
        category = category.replace("_", " ")
        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download the MM-STAR validation split and build one multimodal Instance per example
        of the selected category.

        :param output_path: Directory used both as the HF datasets cache and for saved image files.
        :return: Instances whose input is [image, question] and whose single reference is the gold answer.
        """
        instances: List[Instance] = []

        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
            # The dataset ships all categories in one split; keep only the requested one.
            category: str = row["category"]
            if category != self._category:
                continue

            # Save the image to disk keyed by its content hash; reuse the file if it already exists.
            image = row["image"]
            image_file_name: str = generate_hash(image) + ".jpg"
            local_image_path: str = os.path.join(output_path, image_file_name)
            if not os.path.exists(local_image_path):
                image.save(local_image_path)

            content: List[MediaObject] = [
                MediaObject(location=local_image_path, content_type="image/jpeg"),
                MediaObject(text=row["question"], content_type="text/plain"),
            ]
            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
Oops, something went wrong.