From 5c0bd546bbc7f76b7c39c8d461414bf5997d58af Mon Sep 17 00:00:00 2001
From: Baber
Date: Fri, 13 Sep 2024 22:33:50 +0500
Subject: [PATCH 1/3] del mathvista

---
 lm_eval/tasks/mathvista/testmini.yaml | 17 -----------------
 lm_eval/tasks/mathvista/utils.py      |  5 -----
 2 files changed, 22 deletions(-)
 delete mode 100644 lm_eval/tasks/mathvista/testmini.yaml
 delete mode 100644 lm_eval/tasks/mathvista/utils.py

diff --git a/lm_eval/tasks/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml
deleted file mode 100644
index 52b5384041..0000000000
--- a/lm_eval/tasks/mathvista/testmini.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-dataset_path: AI4Math/MathVista
-task: mathvista_mcq
-test_split: testmini
-output_type: multiple_choice
-process_docs: !function utils.process_docs
-doc_to_image: !function utils.doc_to_image
-doc_to_text: "{{query}}"
-doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}'
-doc_to_target: "{{choices.index(answer)}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-metadata:
-  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
diff --git a/lm_eval/tasks/mathvista/utils.py b/lm_eval/tasks/mathvista/utils.py
deleted file mode 100644
index 19c64035ea..0000000000
--- a/lm_eval/tasks/mathvista/utils.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import datasets
-
-
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
-    return dataset.filter(lambda x: x["question_type"].strip() == "multi_choice")

From a3bb2f15006ac113655ff52d13c6e6d3ac051c77 Mon Sep 17 00:00:00 2001
From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:35:35 -0400
Subject: [PATCH 2/3] add some sample scores

---
 lm_eval/tasks/mmmu/README.md | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md
index 814e5344da..1c1c3215b5 100644
--- a/lm_eval/tasks/mmmu/README.md
+++ b/lm_eval/tasks/mmmu/README.md
@@ -64,6 +64,74 @@ Note: Some questions have multiple images in context. To control for this use `m
 The `mmmu_val` group implements MMMU using processing code [from the original MMMU authors](https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu) and uses the prompt format found in [the MMMU repository for Llava-1.5](https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/configs/llava1.5.yaml). This implementation should give scores on par with or slightly higher than those reported by [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mmmu) for `mmmu_val` and the MMMU repository code.
 
+Scores on several tested models (**all with `--apply_chat_template`**) are:
+
+Qwen2-VL-2B:
+```
+hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.3778|± |0.0155|
+| - Art and Design | 0|none | |acc |↑ |0.5500|± |0.0415|
+| - Business | 0|none | |acc |↑ |0.3600|± |0.0389|
+| - Health and Medicine | 0|none | |acc |↑ |0.3667|± |0.0394|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5167|± |0.0438|
+| - Science | 0|none | |acc |↑ |0.2467|± |0.0352|
+| - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317|
+```
+
+Qwen2-VL-7B:
+```
+hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.5056|± |0.0160|
+| - Art and Design | 0|none | |acc |↑ |0.6917|± |0.0398|
+| - Business | 0|none | |acc |↑ |0.4333|± |0.0406|
+| - Health and Medicine | 0|none | |acc |↑ |0.5667|± |0.0401|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.6750|± |0.0426|
+| - Science | 0|none | |acc |↑ |0.3800|± |0.0392|
+| - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341|
+```
+
+Idefics2-8B:
+
+```
+hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.4011|± |0.0154|
+| - Art and Design | 0|none | |acc |↑ |0.6167|± |0.0436|
+| - Business | 0|none | |acc |↑ |0.3200|± |0.0373|
+| - Health and Medicine | 0|none | |acc |↑ |0.4000|± |0.0401|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5750|± |0.0424|
+| - Science | 0|none | |acc |↑ |0.2600|± |0.0358|
+| - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312|
+```
+
+Llava-v1.6-Mistral-7B:
+```
+hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.3522|± |0.0151|
+| - Art and Design | 0|none | |acc |↑ |0.5167|± |0.0440|
+| - Business | 0|none | |acc |↑ |0.2667|± |0.0362|
+| - Health and Medicine | 0|none | |acc |↑ |0.3867|± |0.0397|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5917|± |0.0433|
+| - Science | 0|none | |acc |↑ |0.2200|± |0.0342|
+| - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299|
+```
+
+
 ### Checklist

From 5f76efd2ae467898b191675fd6d5a8f5fdfd8392 Mon Sep 17 00:00:00 2001
From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:38:04 -0400
Subject: [PATCH 3/3] Update README.md

---
 lm_eval/tasks/mmmu/README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md
index 1c1c3215b5..e9d0da12f6 100644
--- a/lm_eval/tasks/mmmu/README.md
+++ b/lm_eval/tasks/mmmu/README.md
@@ -81,6 +81,8 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.2467|± |0.0352|
 | - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317|
 ```
+Author-reported score: 41.1%
+
 
 Qwen2-VL-7B:
 ```
@@ -97,9 +99,9 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.3800|± |0.0392|
 | - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341|
 ```
+Author-reported score: 54.1%
 
 Idefics2-8B:
-
 ```
 hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
 ```
@@ -114,6 +116,7 @@ hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.2600|± |0.0358|
 | - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312|
 ```
+Author-reported score: ~43%
 
 Llava-v1.6-Mistral-7B:
 ```
@@ -130,7 +133,7 @@ hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=
 | - Science | 0|none | |acc |↑ |0.2200|± |0.0342|
 | - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299|
 ```
-
+Author-reported score: 35.3%
 
 ### Checklist
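
For reference, a command of the following shape should reproduce the `mmmu_val` rows above. This is a sketch, not a logged command: the `lm_eval` flags are inferred from the model strings printed above the tables and the `--apply_chat_template` note, so swap in the model and `--model_args` you want to test (e.g. `max_images=2` for Idefics2) and adjust the batch size to your hardware.

```
# hypothetical reproduction command, inferred from the logged model strings above
lm_eval --model hf-multimodal \
    --model_args pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True \
    --tasks mmmu_val \
    --apply_chat_template \
    --batch_size 2
```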