From a480b47bb8cfddfa9f4c9afbbe70bce8b0f32c62 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sat, 26 Oct 2024 07:59:16 -0700 Subject: [PATCH] VHELM v2.1.0 (#3101) --- .../presentation/run_entries_vhelm.conf | 35 ++++- .../presentation/run_entries_vhelm_debug.conf | 27 +++- src/helm/benchmark/run_specs/vlm_run_specs.py | 44 ++++++ .../vision_language/blink_scenario.py | 140 ++++++++++++++++++ .../vision_language/mm_star_scenario.py | 92 ++++++++++++ src/helm/benchmark/static/schema_vhelm.yaml | 136 ++++++++++++----- 6 files changed, 432 insertions(+), 42 deletions(-) create mode 100644 src/helm/benchmark/scenarios/vision_language/blink_scenario.py create mode 100644 src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py diff --git a/src/helm/benchmark/presentation/run_entries_vhelm.conf b/src/helm/benchmark/presentation/run_entries_vhelm.conf index ae498f578d..f5d4495d60 100644 --- a/src/helm/benchmark/presentation/run_entries_vhelm.conf +++ b/src/helm/benchmark/presentation/run_entries_vhelm.conf @@ -11,6 +11,21 @@ entries: [ {description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]} {description: "viz_wiz:model=vlm", priority: 1} + # BLINK + {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]} + + # MM-STAR + {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + # Image captioning {description: "flickr30k:model=vlm,num_respondents=1", priority: 1} @@ -43,6 +58,16 @@ entries: [ # Mementos {description: "mementos:subject=dailylife,num_respondents=1,model=vlm", priority: 1} + # BLINK + {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]} + + # MM-STAR + {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + #################################################################################################################### # Knowledge: Does the model have knowledge about the world or specific domains? 
#################################################################################################################### @@ -96,6 +121,13 @@ entries: [ {description: "vibe_eval:subject=difficulty-normal,model=vlm,num_respondents=1", priority: 1} {description: "vibe_eval:subject=difficulty-hard,model=vlm,num_respondents=1", priority: 1} + # BLINK + {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]} + {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]} + + # MM-STAR + {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]} + #################################################################################################################### # Bias: Are the generations biased in demographic representation (e.g., gender, skin tone)? #################################################################################################################### @@ -189,9 +221,6 @@ entries: [ # Robustness: Is the model robust to invariant input (text/image) perturbations? #################################################################################################################### - {description: "vqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["vqa_robustness"]} - {description: "a_okvqa:model=vlm,data_augmentation=robustness", priority: 1, groups: ["a_okvqa_robustness"]} - {description: "unicorn:subject=OODCV-VQA,model=vlm", priority: 1} {description: "unicorn:subject=Sketchy-VQA,model=vlm", priority: 1} diff --git a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf index ce196dbbe8..1f459c314a 100644 --- a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf +++ b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf @@ -1,9 +1,26 @@ entries: [ - {description: "bingo:subject=Region,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=OCR,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=Factual,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=T2I,model=vlm,num_respondents=1", priority: 1} - {description: "bingo:subject=I2I,model=vlm,num_respondents=1", priority: 1} + {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]} + {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]} + {description: "mm_star:category=science_&_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]} + {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Semantic_Correspondence,model=vlm", 
priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} + {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]} + + {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]} + {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]} + + {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]} ] \ No newline at end of file diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 852a334505..5919166e0c 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -887,6 +887,50 @@ def get_real_world_qa_spec() -> RunSpec: ) +@run_spec_function("blink") +def get_blink_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.blink_scenario.BlinkScenario", + args={"category": category}, + ) + adapter_spec: AdapterSpec = _get_generation_adapter_spec( + instructions="Answer the multiple choice question by just giving the letter of the correct answer.", + max_tokens=1, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + + run_spec_name: str = "blink" + return RunSpec( + name=f"{run_spec_name}:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + +@run_spec_function("mm_star") +def get_mm_star_spec(category: str) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.mm_star_scenario.MMStarScenario", + args={"category": category}, + ) + adapter_spec: AdapterSpec = _get_generation_adapter_spec( + instructions="Answer the multiple choice question by just giving the letter of the correct answer.", + max_tokens=1, + ) + metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs() + + run_spec_name: str = "mm_star" + return RunSpec( + name=f"{run_spec_name}:category={category}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + ) + + @run_spec_function("exams_v") def get_exams_v_spec(language: str, subject_grouped: str, type: str = "image_text") -> RunSpec: scenario_spec = ScenarioSpec( diff --git a/src/helm/benchmark/scenarios/vision_language/blink_scenario.py b/src/helm/benchmark/scenarios/vision_language/blink_scenario.py new file mode 100644 index 0000000000..60f8ecd1d8 --- /dev/null +++ b/src/helm/benchmark/scenarios/vision_language/blink_scenario.py @@ -0,0 +1,140 @@ +from typing import List +import os + +from datasets import load_dataset +from tqdm import tqdm + +from helm.benchmark.scenarios.scenario import ( + CORRECT_TAG, + VALID_SPLIT, + Instance, + Input, + Output, + Reference, + Scenario, +) +from helm.common.media_object import MediaObject, MultimediaObject +from helm.common.images_utils import generate_hash + + +class BlinkScenario(Scenario): + 
""" + BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, + but pose significant challenges for VLMs. + + Website: https://zeyofu.github.io/blink/ + + @article{fu2024blink, + title={BLINK: Multimodal Large Language Models Can See but Not Perceive}, + author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth, + Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay}, + journal={arXiv preprint arXiv:2404.12390}, + year={2024} + } + """ + + HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK" + + VALID_CATEGORIES: List[str] = [ + "Art_Style", + "Counting", + "Forensic_Detection", + "Functional_Correspondence", + "IQ_Test", + "Jigsaw", + "Multi-view_Reasoning", + "Object_Localization", + "Relative_Depth", + "Relative_Reflectance", + "Semantic_Correspondence", + "Spatial_Relation", + "Visual_Correspondence", + "Visual_Similarity", + ] + + name = "blink" + description = ( + "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, " + "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))." + ) + tags = ["vision-language", "knowledge", "reasoning"] + + def __init__(self, category: str): + super().__init__() + + if category not in self.VALID_CATEGORIES: + raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}") + self._category: str = category + + def get_instances(self, output_path: str) -> List[Instance]: + def save_image(image) -> str: + image_file_name: str = generate_hash(image) + ".jpg" + local_image_path: str = os.path.join(output_path, image_file_name) + if not os.path.exists(local_image_path): + image.save(local_image_path) + return local_image_path + + def get_image_header(image_index: int) -> str: + if image_index == 1: + return "First image:" + elif image_index == 2: + return "Second image:" + elif image_index == 3: + return "Third image:" + elif image_index == 4: + return "Fourth image:" + else: + raise ValueError(f"Invalid image index: {image_index}") + + instances: List[Instance] = [] + for row in tqdm( + load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path) + ): + # Save the image(s) to disk + has_multiple_images: bool = row["image_2"] is not None + content: List[MediaObject] = [] + + if has_multiple_images: + # An example can have up to 4 images + for i in range(1, 5): + image_i = row[f"image_{i}"] + if image_i is None: + break + + # Before each image, include a header text that indicates which number image it is. + # Some prompts refer to specific image numbers within the question, e.g., + # "Given three similar but different images, take the first image as reference. + # Can you tell which one of the latter two images is most similar to the first one? + # Select from the following choices. (A) the second image (B) the third image" + image_path: str = save_image(image_i) + content.extend( + [ + MediaObject(text=get_image_header(i), content_type="text/plain"), + MediaObject(location=image_path, content_type="image/jpeg"), + ] + ) + else: + image1 = row["image_1"] + image1_path: str = save_image(image1) + content.append(MediaObject(location=image1_path, content_type="image/jpeg")) + + # Add the prompt that has both the question and the answer choices + prompt: str = row["prompt"] + # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. 
since we are just expecting the letter answer
+            prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
+            content.append(MediaObject(text=prompt, content_type="text/plain"))
+
+            # The answer is the correct letter choice wrapped in parentheses, e.g., "(A)"
+            paren_letter_answer: str = row["answer"]
+            assert (
+                paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
+            ), f"Unexpected answer format: {paren_letter_answer}"
+            letter_answer: str = paren_letter_answer[1]
+            references: List[Reference] = [
+                Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
+            ]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
diff --git a/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py b/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
new file mode 100644
index 0000000000..48990694ec
--- /dev/null
+++ b/src/helm/benchmark/scenarios/vision_language/mm_star_scenario.py
@@ -0,0 +1,92 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class MMStarScenario(Scenario):
+    """
+    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
+    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
+    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
+    are first roughly selected from current benchmarks with an automated pipeline; strict human review is then
+    applied to ensure that each selected sample exhibits visual dependency, introduces minimal data leakage, and
+    requires advanced multi-modal capabilities to solve.
+
+    Website: https://mmstar-benchmark.github.io/
+
+    @article{chen2024we,
+      title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
+      author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
+      Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
+      journal={arXiv preprint arXiv:2403.20330},
+      year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
+
+    VALID_CATEGORIES: List[str] = [
+        "coarse perception",
+        "fine-grained perception",
+        "instance reasoning",
+        "logical reasoning",
+        "math",
+        "science & technology",
+    ]
+
+    name = "mm_star"
+    description = (
+        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
+        "meticulously selected by humans. "
+        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        category = category.replace("_", " ")
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. 
Valid categories are: {self.VALID_CATEGORIES}") + self._category: str = category + + def get_instances(self, output_path: str) -> List[Instance]: + instances: List[Instance] = [] + + for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)): + # Filter by category + category: str = row["category"] + if category != self._category: + continue + + # Save the image to disk + image = row["image"] + image_file_name: str = generate_hash(image) + ".jpg" + local_image_path: str = os.path.join(output_path, image_file_name) + if not os.path.exists(local_image_path): + image.save(local_image_path) + + content: List[MediaObject] = [ + MediaObject(location=local_image_path, content_type="image/jpeg"), + MediaObject(text=row["question"], content_type="text/plain"), + ] + references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])] + instances.append( + Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT) + ) + + return instances diff --git a/src/helm/benchmark/static/schema_vhelm.yaml b/src/helm/benchmark/static/schema_vhelm.yaml index b62db797dc..2e4bc39451 100644 --- a/src/helm/benchmark/static/schema_vhelm.yaml +++ b/src/helm/benchmark/static/schema_vhelm.yaml @@ -295,6 +295,8 @@ run_groups: - viz_wiz - flickr30k - pope + - mm_star_perception + - blink_perception - name: reasoning display_name: Reasoning description: Does the model understand objects, counts and spatial relations? Can the model reason about both the text and image input? @@ -305,6 +307,8 @@ run_groups: - seed_bench - mementos - real_world_qa + - mm_star_reasoning + - blink_reasoning - name: knowledge display_name: Knowledge description: Does the model have knowledge about the world and common sense? @@ -314,6 +318,8 @@ run_groups: - mmmu - mme - vibe_eval + - mm_star_knowledge + - blink_knowledge - name: bias display_name: Bias description: Are the generations biased in demographic representation? We focus on gender and skin tone bias. @@ -347,8 +353,6 @@ run_groups: description: Is the model robust to perturbations? We focus on both text and image perturbations. category: Core scenarios subgroups: - - vqa_robustness - - a_okvqa_robustness - unicorn - bingo - name: multilinguality @@ -394,22 +398,6 @@ run_groups: when: "2023" language: English - - name: a_okvqa_robustness - display_name: A-OKVQA (robustness) - description: Robustness Typos Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)). - metric_groups: - - robustness - - general_information - environment: - main_name: exact_match - main_split: valid - taxonomy: - task: multiple-choice question answering - what: Real-world images - who: Human experts - when: "2023" - language: English - - name: a_okvqa_chinese display_name: A-OKVQA (chinese) description: Chinese Translation Perturbation + A-OKVQA ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718)). @@ -474,6 +462,102 @@ run_groups: when: "2023" language: Swahili + - name: mm_star_perception + display_name: MM-Star (Perception subsets) + description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)). 
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_reasoning
+    display_name: MM-Star (Reasoning subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mm_star_knowledge
+    display_name: MM-Star (Knowledge subsets)
+    description: MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously selected by humans. ([Chen et al., 2024](https://arxiv.org/abs/2403.20330)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_perception
+    display_name: BLINK (Perception subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_knowledge
+    display_name: BLINK (Knowledge subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: blink_reasoning
+    display_name: BLINK (Reasoning subsets)
+    description: BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2024"
+      language: English
+
   - name: crossmodal_3600
     display_name: Crossmodal 3600
     description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([Thapliyal et al., 2022](https://arxiv.org/abs/2205.12522))
@@ -603,22 +687,6 @@ run_groups:
       when: "2017"
       language: English
 
-  - name: vqa_robustness
-    display_name: VQAv2 (robustness)
-    description: Robustness Typos Perturbation + Open-ended questions about real-world images ([Goyal et al., 2017](https://arxiv.org/abs/1612.00837)).
- metric_groups: - - robustness - - general_information - environment: - main_name: quasi_exact_match - main_split: valid - taxonomy: - task: short-answer question answering - what: Real-world images - who: Human experts - when: "2017" - language: English - - name: math_vista display_name: MathVista description: A benchmark designed to combine challenges from diverse mathematical and visual tasks ([Lu et al., 2024](https://arxiv.org/abs/2310.02255)).
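
Reviewer note (not part of the patch): as a quick sanity check, the two new run spec functions can be exercised directly before running the full conf files. The sketch below is illustrative only; it assumes the @run_spec_function decorator returns the wrapped function unchanged (as for the other entries in vlm_run_specs.py), and the category values simply mirror the new run entries above.

    # Build the new BLINK and MM-Star run specs directly and inspect them.
    from helm.benchmark.run_specs.vlm_run_specs import get_blink_spec, get_mm_star_spec

    blink_spec = get_blink_spec(category="Counting")
    print(blink_spec.name)    # blink:category=Counting
    print(blink_spec.groups)  # ['blink']

    # MM-Star categories are passed with underscores; MMStarScenario maps them to the
    # space-separated names in VALID_CATEGORIES (e.g. "science & technology").
    mm_star_spec = get_mm_star_spec(category="science_&_technology")
    print(mm_star_spec.name)  # mm_star:category=science_&_technology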