-
Notifications
You must be signed in to change notification settings - Fork 249
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
432 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 22 additions & 5 deletions
27
src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,26 @@ | ||
entries: [
  {description: "bingo:subject=Region,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=OCR,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=Factual,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=T2I,model=vlm,num_respondents=1", priority: 1}
  {description: "bingo:subject=I2I,model=vlm,num_respondents=1", priority: 1}

  {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
  {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
  {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
  {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}

  {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
  {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]}

  {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]}
  {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]}

  {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]}
  {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]}
  {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]}
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
140 changes: 140 additions & 0 deletions
140
src/helm/benchmark/scenarios/vision_language/blink_scenario.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
from typing import List | ||
import os | ||
|
||
from datasets import load_dataset | ||
from tqdm import tqdm | ||
|
||
from helm.benchmark.scenarios.scenario import ( | ||
CORRECT_TAG, | ||
VALID_SPLIT, | ||
Instance, | ||
Input, | ||
Output, | ||
Reference, | ||
Scenario, | ||
) | ||
from helm.common.media_object import MediaObject, MultimediaObject | ||
from helm.common.images_utils import generate_hash | ||
|
||
|
||
class BlinkScenario(Scenario):
    """
    BLINK: a benchmark of 14 visual perception tasks that humans can solve "within a blink",
    yet which remain challenging for vision-language models.

    Website: https://zeyofu.github.io/blink/

    @article{fu2024blink,
        title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
        author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
        Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
        journal={arXiv preprint arXiv:2404.12390},
        year={2024}
    }
    """

    HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"

    VALID_CATEGORIES: List[str] = [
        "Art_Style",
        "Counting",
        "Forensic_Detection",
        "Functional_Correspondence",
        "IQ_Test",
        "Jigsaw",
        "Multi-view_Reasoning",
        "Object_Localization",
        "Relative_Depth",
        "Relative_Reflectance",
        "Semantic_Correspondence",
        "Spatial_Relation",
        "Visual_Correspondence",
        "Visual_Similarity",
    ]

    name = "blink"
    description = (
        "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
        "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        """Validate and store the BLINK task category (one of VALID_CATEGORIES)."""
        super().__init__()

        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one multimodal Instance per validation example of the selected category."""
        # Ordinal header text shown before each image in multi-image examples.
        ordinal_headers = {1: "First image:", 2: "Second image:", 3: "Third image:", 4: "Fourth image:"}

        def _persist_image(image) -> str:
            # Write the image to disk keyed by its content hash; reuse the file if it already exists.
            destination: str = os.path.join(output_path, generate_hash(image) + ".jpg")
            if not os.path.exists(destination):
                image.save(destination)
            return destination

        def _header_for(index: int) -> str:
            header = ordinal_headers.get(index)
            if header is None:
                raise ValueError(f"Invalid image index: {index}")
            return header

        instances: List[Instance] = []
        for row in tqdm(
            load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
        ):
            content: List[MediaObject] = []

            if row["image_2"] is not None:
                # Multi-image example: an example carries up to 4 images; stop at the first empty slot.
                for index in range(1, 5):
                    image = row[f"image_{index}"]
                    if image is None:
                        break

                    # Precede each image with an ordinal header, because some prompts refer to
                    # specific image numbers within the question, e.g.,
                    # "Given three similar but different images, take the first image as reference.
                    # Can you tell which one of the latter two images is most similar to the first one?
                    # Select from the following choices. (A) the second image (B) the third image"
                    content.append(MediaObject(text=_header_for(index), content_type="text/plain"))
                    content.append(MediaObject(location=_persist_image(image), content_type="image/jpeg"))
            else:
                # Single-image example: no header needed.
                content.append(MediaObject(location=_persist_image(row["image_1"]), content_type="image/jpeg"))

            # The prompt contains both the question and the answer choices.
            # Rewrite "(A)" ... "(D)" as "\nA." ... "\nD." since only the letter answer is expected.
            prompt: str = row["prompt"]
            for letter in "ABCD":
                prompt = prompt.replace(f"({letter})", f"\n{letter}.")
            content.append(MediaObject(text=prompt, content_type="text/plain"))

            # The gold answer arrives as a parenthesized letter, e.g. "(B)"; keep just the letter.
            answer: str = row["answer"]
            assert answer[0] == "(" and answer[-1] == ")", f"Unexpected answer format: {answer}"
            references: List[Reference] = [
                Reference(output=Output(text=answer[1]), tags=[CORRECT_TAG]),
            ]

            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
92 changes: 92 additions & 0 deletions
92
src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
from typing import List | ||
import os | ||
|
||
from datasets import load_dataset | ||
from tqdm import tqdm | ||
|
||
from helm.benchmark.scenarios.scenario import ( | ||
CORRECT_TAG, | ||
VALID_SPLIT, | ||
Instance, | ||
Input, | ||
Output, | ||
Reference, | ||
Scenario, | ||
) | ||
from helm.common.media_object import MediaObject, MultimediaObject | ||
from helm.common.images_utils import generate_hash | ||
|
||
|
||
class MMStarScenario(Scenario):
    """
    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
    are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
    involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
    multi-modal capabilities for the solution.

    Website: https://mmstar-benchmark.github.io/

    @article{chen2024we,
        title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
        author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
        Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
        journal={arXiv preprint arXiv:2403.20330},
        year={2024}
    }
    """

    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"

    # Category names as they appear in the dataset's "category" column (space-separated).
    VALID_CATEGORIES: List[str] = [
        "coarse perception",
        "fine-grained perception",
        "instance reasoning",
        "logical reasoning",
        "math",
        "science & technology",
    ]

    name = "mm_star"
    description = (
        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
        # Fix: a space is required before the citation — the adjacent literals previously
        # concatenated to "...selected by humans.([Chen, 2024]...".
        "meticulously selected by humans. "
        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        """
        Validate and store the MM-STAR category.

        :param category: Category name with underscores in place of spaces (as passed from run entries,
            e.g. "coarse_perception"); normalized here to the dataset's space-separated form.
        :raises ValueError: If the normalized category is not one of VALID_CATEGORIES.
        """
        super().__init__()

        # Run-entry specs use underscores; the dataset's "category" column uses spaces.
        category = category.replace("_", " ")
        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Download the MM-STAR validation split and build one multimodal Instance per example
        of the selected category.

        :param output_path: Directory used both as the HF datasets cache and for saved image files.
        :return: Instances whose input is [image, question] and whose single reference is the gold answer.
        """
        instances: List[Instance] = []

        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
            # The dataset ships all categories in one split; keep only the requested one.
            category: str = row["category"]
            if category != self._category:
                continue

            # Save the image to disk keyed by its content hash; reuse the file if it already exists.
            image = row["image"]
            image_file_name: str = generate_hash(image) + ".jpg"
            local_image_path: str = os.path.join(output_path, image_file_name)
            if not os.path.exists(local_image_path):
                image.save(local_image_path)

            content: List[MediaObject] = [
                MediaObject(location=local_image_path, content_type="image/jpeg"),
                MediaObject(text=row["question"], content_type="text/plain"),
            ]
            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
Oops, something went wrong.