From 5c0bd546bbc7f76b7c39c8d461414bf5997d58af Mon Sep 17 00:00:00 2001
From: Baber
Date: Fri, 13 Sep 2024 22:33:50 +0500
Subject: [PATCH 1/3] del mathvista

---
 lm_eval/tasks/mathvista/testmini.yaml | 17 -----------------
 lm_eval/tasks/mathvista/utils.py      |  5 -----
 2 files changed, 22 deletions(-)
 delete mode 100644 lm_eval/tasks/mathvista/testmini.yaml
 delete mode 100644 lm_eval/tasks/mathvista/utils.py

diff --git a/lm_eval/tasks/mathvista/testmini.yaml b/lm_eval/tasks/mathvista/testmini.yaml
deleted file mode 100644
index 52b5384041..0000000000
--- a/lm_eval/tasks/mathvista/testmini.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-dataset_path: AI4Math/MathVista
-task: mathvista_mcq
-test_split: testmini
-output_type: multiple_choice
-process_docs: !function utils.process_docs
-doc_to_image: !function utils.doc_to_image
-doc_to_text: "{{query}}"
-doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}'
-doc_to_target: "{{choices.index(answer)}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-metadata:
-  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
diff --git a/lm_eval/tasks/mathvista/utils.py b/lm_eval/tasks/mathvista/utils.py
deleted file mode 100644
index 19c64035ea..0000000000
--- a/lm_eval/tasks/mathvista/utils.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import datasets
-
-
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
-    return dataset.filter(lambda x: x["question_type"].strip() == "multi_choice")

From a3bb2f15006ac113655ff52d13c6e6d3ac051c77 Mon Sep 17 00:00:00 2001
From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:35:35 -0400
Subject: [PATCH 2/3] add some sample scores

---
 lm_eval/tasks/mmmu/README.md | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md
index 814e5344da..1c1c3215b5 100644
--- a/lm_eval/tasks/mmmu/README.md
+++ b/lm_eval/tasks/mmmu/README.md
@@ -64,6 +64,74 @@ Note: Some questions have multiple images in context. To control for this use `m
 The `mmmu_val` group implements MMMU using processing code [from the original MMMU authors](https://github.com/MMMU-Benchmark/MMMU/tree/main/mmmu) and uses the prompt format found in [the MMMU repository for Llava-1.5](https://github.com/MMMU-Benchmark/MMMU/blob/main/mmmu/configs/llava1.5.yaml). This implementation should give scores on par with or slightly higher than those reported by [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mmmu) for `mmmu_val` and the MMMU repository code.
 
+Scores on several tested models (**all with `--apply_chat_template`**) are:
+
+Qwen2-VL-2B:
+```
+hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.3778|± |0.0155|
+| - Art and Design | 0|none | |acc |↑ |0.5500|± |0.0415|
+| - Business | 0|none | |acc |↑ |0.3600|± |0.0389|
+| - Health and Medicine | 0|none | |acc |↑ |0.3667|± |0.0394|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5167|± |0.0438|
+| - Science | 0|none | |acc |↑ |0.2467|± |0.0352|
+| - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317|
+```
+
+Qwen2-VL-7B:
+```
+hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.5056|± |0.0160|
+| - Art and Design | 0|none | |acc |↑ |0.6917|± |0.0398|
+| - Business | 0|none | |acc |↑ |0.4333|± |0.0406|
+| - Health and Medicine | 0|none | |acc |↑ |0.5667|± |0.0401|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.6750|± |0.0426|
+| - Science | 0|none | |acc |↑ |0.3800|± |0.0392|
+| - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341|
+```
+
+Idefics2-8B:
+
+```
+hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.4011|± |0.0154|
+| - Art and Design | 0|none | |acc |↑ |0.6167|± |0.0436|
+| - Business | 0|none | |acc |↑ |0.3200|± |0.0373|
+| - Health and Medicine | 0|none | |acc |↑ |0.4000|± |0.0401|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5750|± |0.0424|
+| - Science | 0|none | |acc |↑ |0.2600|± |0.0358|
+| - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312|
+```
+
+Llava-v1.6-Mistral-7B:
+```
+hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
+```
+```
+| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|--------------------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmmu_val | 0|none | |acc |↑ |0.3522|± |0.0151|
+| - Art and Design | 0|none | |acc |↑ |0.5167|± |0.0440|
+| - Business | 0|none | |acc |↑ |0.2667|± |0.0362|
+| - Health and Medicine | 0|none | |acc |↑ |0.3867|± |0.0397|
+| - Humanities and Social Science| 0|none | |acc |↑ |0.5917|± |0.0433|
+| - Science | 0|none | |acc |↑ |0.2200|± |0.0342|
+| - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299|
+```
+
+
 ### Checklist

From 5f76efd2ae467898b191675fd6d5a8f5fdfd8392 Mon Sep 17 00:00:00 2001
From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:38:04 -0400
Subject: [PATCH 3/3] Update README.md

---
 lm_eval/tasks/mmmu/README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/lm_eval/tasks/mmmu/README.md b/lm_eval/tasks/mmmu/README.md
index 1c1c3215b5..e9d0da12f6 100644
--- a/lm_eval/tasks/mmmu/README.md
+++ b/lm_eval/tasks/mmmu/README.md
@@ -81,6 +81,8 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.2467|± |0.0352|
 | - Tech and Engineering | 0|none | |acc |↑ |0.3143|± |0.0317|
 ```
+Author-reported score: 41.1%
+
 
 Qwen2-VL-7B:
 ```
@@ -97,9 +99,9 @@ hf-multimodal (pretrained=Qwen/Qwen2-VL-7B-Instruct,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.3800|± |0.0392|
 | - Tech and Engineering | 0|none | |acc |↑ |0.4000|± |0.0341|
 ```
+Author-reported score: 54.1%
 
 Idefics2-8B:
-
 ```
 hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True,max_images=2), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2
 ```
@@ -114,6 +116,7 @@ hf-multimodal (pretrained=HuggingFaceM4/idefics2-8b,attn_implementation=flash_at
 | - Science | 0|none | |acc |↑ |0.2600|± |0.0358|
 | - Tech and Engineering | 0|none | |acc |↑ |0.3381|± |0.0312|
 ```
+Author-reported score: ~43%
 
 Llava-v1.6-Mistral-7B:
 ```
@@ -130,7 +133,7 @@ hf-multimodal (pretrained=llava-hf/llava-v1.6-mistral-7b-hf,attn_implementation=
 | - Science | 0|none | |acc |↑ |0.2200|± |0.0342|
 | - Tech and Engineering | 0|none | |acc |↑ |0.2524|± |0.0299|
 ```
-
+Author-reported score: 35.3%
 
 ### Checklist
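
For reference, a command of the following shape should reproduce the `mmmu_val` rows above. This is a sketch, not a logged command: the `lm_eval` flags are inferred from the model strings printed above the tables and the `--apply_chat_template` note, so swap in the model and `--model_args` you want to test (e.g. `max_images=2` for Idefics2) and adjust the batch size to your hardware.

```
# hypothetical reproduction command, inferred from the logged model strings above
lm_eval --model hf-multimodal \
    --model_args pretrained=Qwen/Qwen2-VL-2B-Instruct,attn_implementation=flash_attention_2,dtype=bfloat16,convert_img_format=True \
    --tasks mmmu_val \
    --apply_chat_template \
    --batch_size 2
```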