VHELM v2.1.0 (#3101)
teetone authored Oct 26, 2024
1 parent bf666fe commit a480b47
Showing 6 changed files with 432 additions and 42 deletions.
35 changes: 32 additions & 3 deletions src/helm/benchmark/presentation/run_entries_vhelm.conf
@@ -11,6 +11,21 @@ entries: [
{description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]}
{description: "viz_wiz:model=vlm", priority: 1}

# BLINK
{description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]}

# MM-STAR
{description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
{description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}

# Image captioning
{description: "flickr30k:model=vlm,num_respondents=1", priority: 1}

@@ -43,6 +58,16 @@ entries: [
# Mementos
{description: "mementos:subject=dailylife,num_respondents=1,model=vlm", priority: 1}

# BLINK
{description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]}
{description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]}
{description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]}

# MM-STAR
{description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
{description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
{description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}

####################################################################################################################
# Knowledge: Does the model have knowledge about the world or specific domains?
####################################################################################################################
Expand Down Expand Up @@ -96,6 +121,13 @@ entries: [
{description: "vibe_eval:subject=difficulty-normal,model=vlm,num_respondents=1", priority: 1}
{description: "vibe_eval:subject=difficulty-hard,model=vlm,num_respondents=1", priority: 1}

# BLINK
{description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]}
{description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]}

# MM-STAR
{description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}

####################################################################################################################
# Bias: Are the generations biased in demographic representation (e.g., gender, skin tone)?
####################################################################################################################
@@ -189,9 +221,6 @@ entries: [
# Robustness: Is the model robust to invariant input (text/image) perturbations?
####################################################################################################################

{description: "vqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["vqa_robustness"]}
{description: "a_okvqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["a_okvqa_robustness"]}

{description: "unicorn:subject=OODCV-VQA,model=vlm", priority: 1}
{description: "unicorn:subject=Sketchy-VQA,model=vlm", priority: 1}

27 changes: 22 additions & 5 deletions src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
@@ -1,9 +1,26 @@
entries: [

{description: "bingo:subject=Region,model=vlm,num_respondents=1", priority: 1}
{description: "bingo:subject=OCR,model=vlm,num_respondents=1", priority: 1}
{description: "bingo:subject=Factual,model=vlm,num_respondents=1", priority: 1}
{description: "bingo:subject=T2I,model=vlm,num_respondents=1", priority: 1}
{description: "bingo:subject=I2I,model=vlm,num_respondents=1", priority: 1}
{description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
{description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
{description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
{description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
{description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
{description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}

{description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
{description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]}

{description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]}
{description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]}

{description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]}
{description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]}
{description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]}
]
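
Each description string above resolves to a run spec function registered in src/helm/benchmark/run_specs/vlm_run_specs.py (see the changes to that file below); the model=vlm portion is expanded separately by HELM into the concrete VLM deployments under evaluation. A minimal sketch of that mapping (illustrative only, not part of this commit), calling one of the new functions directly:

from helm.benchmark.run_specs.vlm_run_specs import get_blink_spec

# Corresponds to the debug entry "blink:category=Counting,model=vlm" above.
run_spec = get_blink_spec(category="Counting")
print(run_spec.name)    # blink:category=Counting
print(run_spec.groups)  # ['blink']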
44 changes: 44 additions & 0 deletions src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -887,6 +887,50 @@ def get_real_world_qa_spec() -> RunSpec:
    )


@run_spec_function("blink")
def get_blink_spec(category: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.blink_scenario.BlinkScenario",
        args={"category": category},
    )
    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
        instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
        max_tokens=1,
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

    run_spec_name: str = "blink"
    return RunSpec(
        name=f"{run_spec_name}:category={category}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("mm_star")
def get_mm_star_spec(category: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.mm_star_scenario.MMStarScenario",
        args={"category": category},
    )
    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
        instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
        max_tokens=1,
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

    run_spec_name: str = "mm_star"
    return RunSpec(
        name=f"{run_spec_name}:category={category}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )
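# Note: both run specs above share the same single-letter multiple-choice setup
# (max_tokens=1 with exact-match and open-ended generation metrics), and both default
# groups to the run spec name ("blink" / "mm_star"); the finer-grained presentation
# groups such as "blink_perception" or "mm_star_reasoning" are assigned per category
# by the run_entries_vhelm.conf entries in this commit.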


@run_spec_function("exams_v")
def get_exams_v_spec(language: str, subject_grouped: str, type: str = "image_text") -> RunSpec:
    scenario_spec = ScenarioSpec(
140 changes: 140 additions & 0 deletions src/helm/benchmark/scenarios/vision_language/blink_scenario.py
@@ -0,0 +1,140 @@
from typing import List
import os

from datasets import load_dataset
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    VALID_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.images_utils import generate_hash


class BlinkScenario(Scenario):
"""
BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”,
but pose significant challenges for VLMs.
Website: https://zeyofu.github.io/blink/
@article{fu2024blink,
title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
journal={arXiv preprint arXiv:2404.12390},
year={2024}
}
"""

    HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"

    VALID_CATEGORIES: List[str] = [
        "Art_Style",
        "Counting",
        "Forensic_Detection",
        "Functional_Correspondence",
        "IQ_Test",
        "Jigsaw",
        "Multi-view_Reasoning",
        "Object_Localization",
        "Relative_Depth",
        "Relative_Reflectance",
        "Semantic_Correspondence",
        "Spatial_Relation",
        "Visual_Correspondence",
        "Visual_Similarity",
    ]

    name = "blink"
    description = (
        "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
        "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
    )
    tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        super().__init__()

        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        def save_image(image) -> str:
            image_file_name: str = generate_hash(image) + ".jpg"
            local_image_path: str = os.path.join(output_path, image_file_name)
            if not os.path.exists(local_image_path):
                image.save(local_image_path)
            return local_image_path

        def get_image_header(image_index: int) -> str:
            if image_index == 1:
                return "First image:"
            elif image_index == 2:
                return "Second image:"
            elif image_index == 3:
                return "Third image:"
            elif image_index == 4:
                return "Fourth image:"
            else:
                raise ValueError(f"Invalid image index: {image_index}")

        instances: List[Instance] = []
        for row in tqdm(
            load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
        ):
            # Save the image(s) to disk
            has_multiple_images: bool = row["image_2"] is not None
            content: List[MediaObject] = []

            if has_multiple_images:
                # An example can have up to 4 images
                for i in range(1, 5):
                    image_i = row[f"image_{i}"]
                    if image_i is None:
                        break

                    # Before each image, include a header text that indicates which number image it is.
                    # Some prompts refer to specific image numbers within the question, e.g.,
                    # "Given three similar but different images, take the first image as reference.
                    # Can you tell which one of the latter two images is most similar to the first one?
                    # Select from the following choices. (A) the second image (B) the third image"
                    image_path: str = save_image(image_i)
                    content.extend(
                        [
                            MediaObject(text=get_image_header(i), content_type="text/plain"),
                            MediaObject(location=image_path, content_type="image/jpeg"),
                        ]
                    )
            else:
                image1 = row["image_1"]
                image1_path: str = save_image(image1)
                content.append(MediaObject(location=image1_path, content_type="image/jpeg"))

            # Add the prompt that has both the question and the answer choices
            prompt: str = row["prompt"]
            # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. since we are just expecting the letter answer
            prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
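            # For example, "... Select from the following choices. (A) the second image (B) the third image"
            # becomes "... Select from the following choices. \nA. the second image \nB. the third image",
            # and the expected completion is then just the letter (e.g., "A").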
            content.append(MediaObject(text=prompt, content_type="text/plain"))

            # The answer has the correct letter choice surrounded by parentheses
            paren_letter_answer: str = row["answer"]
            assert (
                paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
            ), f"Unexpected answer format: {paren_letter_answer}"
            letter_answer: str = paren_letter_answer[1]
            references: List[Reference] = [
                Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
            ]
            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
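
A minimal usage sketch for the new scenario (illustrative only, not part of this commit; the output path is hypothetical). get_instances downloads the BLINK validation split for the chosen category from Hugging Face into output_path and saves the images there as JPEGs:

from helm.benchmark.scenarios.vision_language.blink_scenario import BlinkScenario

scenario = BlinkScenario(category="Counting")
instances = scenario.get_instances(output_path="/tmp/blink_counting")  # hypothetical path
example = instances[0]
print(example.input.multimedia_content.content)  # the image(s) followed by the reformatted prompt
print(example.references[0].output.text)         # the correct letter, e.g. "A"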
92 changes: 92 additions & 0 deletions src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
@@ -0,0 +1,92 @@
from typing import List
import os

from datasets import load_dataset
from tqdm import tqdm

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    VALID_SPLIT,
    Instance,
    Input,
    Output,
    Reference,
    Scenario,
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.images_utils import generate_hash


class MMStarScenario(Scenario):
"""
MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
multi-modal capabilities for the solution.
Website: https://mmstar-benchmark.github.io/
@article{chen2024we,
title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
journal={arXiv preprint arXiv:2403.20330},
year={2024}
}
"""

    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"

    VALID_CATEGORIES: List[str] = [
        "coarse perception",
        "fine-grained perception",
        "instance reasoning",
        "logical reasoning",
        "math",
        "science & technology",
    ]

name = "mm_star"
description = (
"MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
"meticulously selected by humans."
"([Chen, 2024](https://arxiv.org/abs/2403.20330))."
)
tags = ["vision-language", "knowledge", "reasoning"]

    def __init__(self, category: str):
        super().__init__()

        category = category.replace("_", " ")
        if category not in self.VALID_CATEGORIES:
            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
        self._category: str = category

    def get_instances(self, output_path: str) -> List[Instance]:
        instances: List[Instance] = []

        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
            # Filter by category
            category: str = row["category"]
            if category != self._category:
                continue

            # Save the image to disk
            image = row["image"]
            image_file_name: str = generate_hash(image) + ".jpg"
            local_image_path: str = os.path.join(output_path, image_file_name)
            if not os.path.exists(local_image_path):
                image.save(local_image_path)

            content: List[MediaObject] = [
                MediaObject(location=local_image_path, content_type="image/jpeg"),
                MediaObject(text=row["question"], content_type="text/plain"),
            ]
            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
            instances.append(
                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
            )

        return instances
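
As with BLINK above, a small illustrative sketch (not part of this commit) of how the conf category names map onto the dataset labels: the constructor converts underscores back to spaces, so an entry like mm_star:category=science_&_technology selects the "science & technology" rows.

from helm.benchmark.scenarios.vision_language.mm_star_scenario import MMStarScenario

scenario = MMStarScenario(category="science_&_technology")
assert scenario._category == "science & technology"
# An unrecognized category raises a ValueError listing VALID_CATEGORIES.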