From a480b47bb8cfddfa9f4c9afbbe70bce8b0f32c62 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sat, 26 Oct 2024 07:59:16 -0700 Subject: [PATCH] VHELM v2.1.0 (#3101) --- .../presentation/run_entries_vhelm.conf | 35 ++++- .../presentation/run_entries_vhelm_debug.conf | 27 +++- src/helm/benchmark/run_specs/vlm_run_specs.py | 44 ++++++ .../vision_language/blink_scenario.py | 140 ++++++++++++++++++ .../vision_language/mm_star_scenario.py | 92 ++++++++++++ src/helm/benchmark/static/schema_vhelm.yaml | 136 ++++++++++++----- 6 files changed, 432 insertions(+), 42 deletions(-) create mode 100644 src/helm/benchmark/scenarios/vision_language/blink_scenario.py create mode 100644 src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py diff --git a/src/helm/benchmark/presentation/run_entries_vhelm.conf b/src/helm/benchmark/presentation/run_entries_vhelm.conf index ae498f578d..f5d4495d60 100644 --- a/src/helm/benchmark/presentation/run_entries_vhelm.conf +++ b/src/helm/benchmark/presentation/run_entries_vhelm.conf @@ -11,6 +11,21 @@ entries: [ {description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]} {description: "viz_wiz:model=vlm", priority: 1} + # BLINK + {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]} + + # MM-STAR + {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + # Image captioning {description: "flickr30k:model=vlm,num_respondents=1", priority: 1} @@ -43,6 +58,16 @@ entries: [ # Mementos {description: "mementos:subject=dailylife,num_respondents=1,model=vlm", priority: 1} + # BLINK + {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]} + + # MM-STAR + {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + #################################################################################################################### # Knowledge: Does the model have knowledge about the world or specific domains? 
#################################################################################################################### @@ -96,6 +121,13 @@ entries: [ {description: "vibe_eval:subject=difficulty-normal,model=vlm,num_respondents=1", priority: 1} {description: "vibe_eval:subject=difficulty-hard,model=vlm,num_respondents=1", priority: 1} + # BLINK + {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]} + {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]} + + # MM-STAR + {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]} + #################################################################################################################### # Bias: Are the generations biased in demographic representation (e.g., gender, skin tone)? #################################################################################################################### @@ -189,9 +221,6 @@ entries: [ # Robustness: Is the model robust to invariant input (text/image) perturbations? #################################################################################################################### - {description: "vqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["vqa_robustness"]} - {description: "a_okvqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["a_okvqa_robustness"]} - {description: "unicorn:subject=OODCV-VQA,model=vlm", priority: 1} {description: "unicorn:subject=Sketchy-VQA,model=vlm", priority: 1} diff --git a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf index ce196dbbe8..1f459c314a 100644 --- a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf +++ b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf @@ -1,9 +1,26 @@ entries: [ - {description: "bingo:subject=Region,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=OCR,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=Factual,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=T2I,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=I2I,model=vlm,num_respondents=1", priority: 1} + {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]} + {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Semantic_Correspondence,model=vlm", 
priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]} + + {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]} + {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]} + + {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]} ] \ No newline at end of file diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 852a334505..5919166e0c 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -887,6 +887,50 @@ def get_real_world_qa_spec() -> RunSpec: ) +@run_spec_function("blink") +def get_blink_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.blink_scenario.BlinkScenario", + args={"category": category}, + ) + adapter_spec: AdapterSpec = _get_generation_adapter_spec( + instructions="Answer the multiple choice question by just giving the letter of the correct answer.", + max_tokens=1, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + + run_spec_name: str = "blink" + return RunSpec( + name=f"{run_spec_name}:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + +@run_spec_function("mm_star") +def get_mm_star_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.mm_star_scenario.MMStarScenario", + args={"category": category}, + ) + adapter_spec: AdapterSpec = _get_generation_adapter_spec( + instructions="Answer the multiple choice question by just giving the letter of the correct answer.", + max_tokens=1, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + + run_spec_name: str = "mm_star" + return RunSpec( + name=f"{run_spec_name}:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + @run_spec_function("exams_v") def get_exams_v_spec(language: str, subject_grouped: str, type: str = "image_text") -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/scenarios/vision_language/blink_scenario.py b/src/helm/benchmark/scenarios/vision_language/blink_scenario.py new file mode 100644 index 0000000000..60f8ecd1d8 --- /dev/null +++ b/src/helm/benchmark/scenarios/vision_language/blink_scenario.py @@ -0,0 +1,140 @@ +from typing import List +import os + +from datasets import load_dataset +from tqdm import tqdm + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + VALID_SPLIT, + Instance, + Input, + Output, + Reference, + Scenario, +) +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.images_utils import generate_hash + + +class BlinkScenario(Scenario): + 
""" + BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, + but pose significant challenges for VLMs. + + Website: https://zeyofu.github.io/blink/ + + @article{fu2024blink, + title={BLINK: Multimodal Large Language Models Can See but Not Perceive}, + author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, + Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal={arXiv preprint arXiv:2404.12390}, + year={2024} + } + """ + + HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK" + + VALID_CATEGORIES: List[str] = [ + "Art_Style", + "Counting", + "Forensic_Detection", + "Functional_Correspondence", + "IQ_Test", + "Jigsaw", + "Multi-view_Reasoning", + "Object_Localization", + "Relative_Depth", + "Relative_Reflectance", + "Semantic_Correspondence", + "Spatial_Relation", + "Visual_Correspondence", + "Visual_Similarity", + ] + + name = "blink" + description = ( + "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, " + "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))." + ) + tags = ["vision-language", "knowledge", "reasoning"] + + def __init__(self, category: str): + super().__init__() + + if category not in self.VALID_CATEGORIES: + raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}") + self._category: str = category + + def get_instances(self, output_path: str) -> List[Instance]: + def save_image(image) -> str: + image_file_name: str = generate_hash(image) + ".jpg" + local_image_path: str = os.path.join(output_path, image_file_name) + if not os.path.exists(local_image_path): + image.save(local_image_path) + return local_image_path + + def get_image_header(image_index: int) -> str: + if image_index == 1: + return "First image:" + elif image_index == 2: + return "Second image:" + elif image_index == 3: + return "Third image:" + elif image_index == 4: + return "Fourth image:" + else: + raise ValueError(f"Invalid image index: {image_index}") + + instances: List[Instance] = [] + for row in tqdm( + load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path) + ): + # Save the image(s) to disk + has_multiple_images: bool = row["image_2"] is not None + content: List[MediaObject] = [] + + if has_multiple_images: + # An example can have up to 4 images + for i in range(1, 5): + image_i = row[f"image_{i}"] + if image_i is None: + break + + # Before each image, include a header text that indicates which number image it is. + # Some prompts refer to specific image numbers within the question, e.g., + # "Given three similar but different images, take the first image as reference. + # Can you tell which one of the latter two images is most similar to the first one? + # Select from the following choices. (A) the second image (B) the third image" + image_path: str = save_image(image_i) + content.extend( + [ + MediaObject(text=get_image_header(i), content_type="text/plain"), + MediaObject(location=image_path, content_type="image/jpeg"), + ] + ) + else: + image1 = row["image_1"] + image1_path: str = save_image(image1) + content.append(MediaObject(location=image1_path, content_type="image/jpeg")) + + # Add the prompt that has both the question and the answer choices + prompt: str = row["prompt"] + # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. 
since we are just expecting the letter answer
+            prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
+            content.append(MediaObject(text=prompt, content_type="text/plain"))
+
+            # The answer is the correct letter choice wrapped in parentheses, e.g., "(A)"
+            paren_letter_answer: str = row["answer"]
+            assert (
+                paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
+            ), f"Unexpected answer format: {paren_letter_answer}"
+            letter_answer: str = paren_letter_answer[1]
+            references: List[Reference] = [
+                Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
+            ]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
diff --git a/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py b/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
new file mode 100644
index 0000000000..48990694ec
--- /dev/null
+++ b/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
@@ -0,0 +1,92 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class MMStarScenario(Scenario):
+    """
+    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
+    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
+    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
+    are first roughly selected from current benchmarks with an automated pipeline; strict human review is then
+    applied to ensure that each selected sample exhibits visual dependency, introduces minimal data leakage, and
+    requires advanced multi-modal capabilities to solve.
+
+    Website: https://mmstar-benchmark.github.io/
+
+    @article{chen2024we,
+      title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
+      author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
+      Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
+      journal={arXiv preprint arXiv:2403.20330},
+      year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
+
+    VALID_CATEGORIES: List[str] = [
+        "coarse perception",
+        "fine-grained perception",
+        "instance reasoning",
+        "logical reasoning",
+        "math",
+        "science & technology",
+    ]
+
+    name = "mm_star"
+    description = (
+        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
+        "meticulously selected by humans. "
+        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        category = category.replace("_", " ")
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. 
Valid categories are: {self.VALID_CATEGORIES}") + self._category: str = category + + def get_instances(self, output_path: str) -> List[Instance]: + instances: List[Instance] = [] + + for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)): + # Filter by category + category: str = row["category"] + if category != self._category: + continue + + # Save the image to disk + image = row["image"] + image_file_name: str = generate_hash(image) + ".jpg" + local_image_path: str = os.path.join(output_path, image_file_name) + if not os.path.exists(local_image_path): + image.save(local_image_path) + + content: List[MediaObject] = [ + MediaObject(location=local_image_path, content_type="image/jpeg"), + MediaObject(text=row["question"], content_type="text/plain"), + ] + references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])] + instances.append( + Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT) + ) + + return instances diff --git a/src/helm/benchmark/static/schema_vhelm.yaml b/src/helm/benchmark/static/schema_vhelm.yaml index b62db797dc..2e4bc39451 100644 --- a/src/helm/benchmark/static/schema_vhelm.yaml +++ b/src/helm/benchmark/static/schema_vhelm.yaml @@ -295,6 +295,8 @@ run_groups: - viz_wiz - flickr30k - pope + - mm_star_perception + - blink_perception - name: reasoning display_name: Reasoning description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input? @@ -305,6 +307,8 @@ run_groups: - seed_bench - mementos - real_world_qa + - mm_star_reasoning + - blink_reasoning - name: knowledge display_name: Knowledge description: Does the model have knowledge about the world and common sense? @@ -314,6 +318,8 @@ run_groups: - mmmu - mme - vibe_eval + - mm_star_knowledge + - blink_knowledge - name: bias display_name: Bias description: Are the generations biased in demographic representation? We focus on gender and skin tone bias. @@ -347,8 +353,6 @@ run_groups: description: Is the model robust to perturbations? We focus on both text and image perturbations. category: Core scenarios subgroups: - - vqa_robustness - - a_okvqa_robustness - unicorn - bingo - name: multilinguality @@ -394,22 +398,6 @@ run_groups: when: "2023" language: English - - name: a_okvqa_robustness - display_name: A-OKVQA (robustness) - description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)). - metric_groups: - - robustness - - general_information - environment: - main_name: exact_match - main_split: valid - taxonomy: - task: multiple-choice question answering - what: Real-world images - who: Human experts - when: "2023" - language: English - - name: a_okvqa_chinese display_name: A-OKVQA (chinese) description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)). @@ -474,6 +462,102 @@ run_groups: when: "2023" language: Swahili + - name: mm_star_perception + display_name: MM-Star (Perception subsets) + description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)). 
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_reasoning
+    display_name: MM-Star (Reasoning subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_knowledge
+    display_name: MM-Star (Knowledge subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_perception
+    display_name: BLINK (Perception subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_knowledge
+    display_name: BLINK (Knowledge subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_reasoning
+    display_name: BLINK (Reasoning subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
   - name: crossmodal_3600
     display_name: Crossmodal 3600
     description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
@@ -603,22 +687,6 @@ run_groups:
       when: "2017"
       language: English
 
-  - name: vqa_robustness
-    display_name: VQAv2 (robustness)
-    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
- metric_groups: - - robustness - - general_information - environment: - main_name: quasi_exact_match - main_split: valid - taxonomy: - task: short-answer question answering - what: Real-world images - who: Human experts - when: "2017" - language: English - - name: math_vista display_name: MathVista description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
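
Reviewer note (not part of the patch): as a quick sanity check, the two new run spec functions can be exercised directly before running the full conf files. The sketch below is illustrative only; it assumes the @run_spec_function decorator returns the wrapped function unchanged (as for the other entries in vlm_run_specs.py), and the category values simply mirror the new run entries above.

    # Build the new BLINK and MM-Star run specs directly and inspect them.
    from helm.benchmark.run_specs.vlm_run_specs import get_blink_spec, get_mm_star_spec

    blink_spec = get_blink_spec(category="Counting")
    print(blink_spec.name)    # blink:category=Counting
    print(blink_spec.groups)  # ['blink']

    # MM-Star categories are passed with underscores; MMStarScenario maps them to the
    # space-separated names in VALID_CATEGORIES (e.g. "science & technology").
    mm_star_spec = get_mm_star_spec(category="science_&_technology")
    print(mm_star_spec.name)  # mm_star:category=science_&_technology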