Support MathVista
czczup committed Feb 4, 2024
1 parent 8fff25b commit ac6e5c9
Showing 7 changed files with 977 additions and 66 deletions.
38 changes: 35 additions & 3 deletions internvl_chat/README.md
@@ -110,9 +110,9 @@ Coming Soon

**MultiModal Benchmark**

| model | MME | MMB<sub>dev/test</sub> | MMB-CN<sub>dev/test</sub> | POPE | MMVP |
| --------------------------------------------------------------------------------- | -------------- | ---------------------- | ------------------------- | ---- | ---- |
| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 1672.3 / 341.1 | 76.6 / 75.4 | 71.5 / 70.1 | 87.2 | 44.7 |
| model | MME | MMB<sub>dev/test</sub> | MMB-CN<sub>dev/test</sub> | POPE | MMVP | MathVista |
| --------------------------------------------------------------------------------- | -------------- | ---------------------- | ------------------------- | ---- | ---- | --------- |
| [InternVL-Chat-V1.1](https://huggingface.co/OpenGVLab/InternVL-Chat-Chinese-V1-1) | 1672.3 / 341.1 | 76.6 / 75.4 | 71.5 / 70.1 | 87.2 | 44.7 | 34.5 |

| model | MMMU<sub>val/test</sub> | CMMMU<sub>val/test</sub> | Tiny<sub>LVLM</sub> | LLaVA<sub>bench</sub> | MM-Vet |
| --------------------------------------------------------------------------------- | ----------------------- | ------------------------ | ------------------- | --------------------- | ------ |
@@ -291,6 +291,9 @@ data
├── MMVP_VLM
│ ├── MLLM_VLM Images/
│ └── Questions.csv
├── MathVista
│ ├── annot_testmini.json
│ └── AI4Math___math_vista/
```
</details>
@@ -1005,3 +1008,32 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 sh evaluate.sh <checkpoint> mmvp
```

</details>

#### [MathVista](https://github.com/lupantech/MathVista)

<details>
<summary>Data Preparation</summary>

```bash
mkdir -p data/MathVista && cd data/MathVista
# Execute the following Python code (a runnable sketch is also given after this block)
# from datasets import load_dataset
# dataset = load_dataset("AI4Math/MathVista")
# dataset.save_to_disk('./MathVista')
wget https://huggingface.co/datasets/AI4Math/MathVista/raw/main/annot_testmini.json
cd ../..
```
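
For convenience, here is a minimal Python sketch of the commented download step above. It simply mirrors those comments; the exact on-disk layout (for example the `AI4Math___math_vista/` directory listed in the data tree) depends on how `datasets` caches and saves the files, so verify the path your evaluation script actually loads.

```python
# Minimal sketch of the commented download step above; run from data/MathVista.
# Assumes the `datasets` library is installed (pip install datasets).
from datasets import load_dataset

# Download MathVista from the Hugging Face Hub
dataset = load_dataset("AI4Math/MathVista")

# Save a local copy, mirroring the commented save_to_disk('./MathVista') call
dataset.save_to_disk("./MathVista")
```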

</details>

<details>
<summary>Evaluation</summary>

```bash
# testmini set
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 sh evaluate.sh <checkpoint> mathvista-testmini
# test set
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 sh evaluate.sh <checkpoint> mathvista-test
```
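
The score aggregation is handled by `eval/mathvista/calculate_score.py` (added in this commit), which writes a `scores.json` containing an `average` entry plus per-category breakdowns. Below is a minimal sketch for inspecting that file, assuming the script's argparse defaults (`--output_dir ./results`, `--score_file scores.json`); adjust the path if your `evaluate.sh` setup writes results elsewhere.

```python
import json

# Path follows the argparse defaults in calculate_score.py; this is an
# assumption and may differ depending on how evaluate.sh names its outputs.
with open('./results/scores.json') as f:
    scores = json.load(f)

avg = scores['average']
print(f"Average accuracy: {avg['accuracy']}% ({avg['correct']}/{avg['total']})")

# Per-task breakdown, already sorted by accuracy when the script saves it
for task, stats in scores.get('task', {}).items():
    print(f"{task}: {stats['accuracy']}% ({stats['correct']}/{stats['total']})")
```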

</details>
258 changes: 258 additions & 0 deletions internvl_chat/eval/mathvista/calculate_score.py
@@ -0,0 +1,258 @@
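"""Score MathVista predictions.

Reads a results JSON whose entries contain an extracted answer ('extraction'),
normalizes each extraction to its question's answer type, compares it with the
ground-truth answer, and reports overall plus per-category accuracy scores.
"""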
import argparse
import json
import os
import re

import pandas as pd
# Requires the python-Levenshtein package: pip install python-Levenshtein
from Levenshtein import distance
from utilities import *  # provides the read_json and save_json helpers


def get_most_similar(prediction, choices):
"""
    Use the Levenshtein (edit) distance to pick the choice most similar to the given prediction.
"""
distances = [distance(prediction, choice) for choice in choices]
ind = distances.index(min(distances))
return choices[ind]
# return min(choices, key=lambda choice: distance(prediction, choice))


def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
"""
Normalize the extracted answer to match the answer type
"""
if question_type == 'multi_choice':
# make sure the extraction is a string
if isinstance(extraction, str):
extraction = extraction.strip()
else:
try:
extraction = str(extraction)
            except Exception:
extraction = ''

# extract "A" from "(A) text"
letter = re.findall(r'\(([a-zA-Z])\)', extraction)
if len(letter) > 0:
extraction = letter[0].upper()

options = [chr(ord('A') + i) for i in range(len(choices))]

if extraction in options:
# convert option letter to text, e.g. "A" -> "text"
ind = options.index(extraction)
extraction = choices[ind]
else:
# select the most similar option
extraction = get_most_similar(extraction, choices)
assert extraction in choices

elif answer_type == 'integer':
try:
extraction = str(int(float(extraction)))
        except Exception:
extraction = None

elif answer_type == 'float':
try:
extraction = str(round(float(extraction), precision))
        except Exception:
extraction = None

elif answer_type == 'list':
try:
extraction = str(extraction)
        except Exception:
extraction = None

return extraction


def safe_equal(prediction, answer):
"""
Check if the prediction is equal to the answer, even if they are of different types
"""
try:
if prediction == answer:
return True
return False
except Exception as e:
print(e)
return False


def get_acc_with_condition(res_pd, key, value):
    """Return (num_correct, num_total, accuracy) for rows whose `key` matches `value`."""
    if key == 'skills':
        # 'skills' holds a list per problem, so test membership instead of equality
        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
    else:
        total_pd = res_pd[res_pd[key] == value]

    correct_pd = total_pd[total_pd['true_false'] == True]  # noqa: E712
    acc = '{:.2f}'.format(len(correct_pd) / len(total_pd) * 100)
    return len(correct_pd), len(total_pd), acc


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_dir', type=str, default='./results')
parser.add_argument('--output_file', type=str, default='output.json')
parser.add_argument('--score_file', type=str, default='scores.json')
parser.add_argument('--gt_file', type=str, default='./data/MathVista/annot_testmini.json', help='ground truth file')
parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
    parser.add_argument('--caculate_gain', action='store_true', help='calculate the score gains over random guess')
parser.add_argument('--random_file', type=str, default='score_random_guess.json')
args = parser.parse_args()

# args
output_file = os.path.join(args.output_dir, args.output_file)

# # quick test
# output_file = '../results/llava-llama-2-13b/output_llava_llama_2_13b.json'

# read json
print(f'Reading {output_file}...')
results = read_json(output_file)

# read ground truth
print(f'Reading {args.gt_file}...')
gts = read_json(args.gt_file)

# full pids
full_pids = list(results.keys())
if args.number > 0:
full_pids = full_pids[:min(args.number, len(full_pids))]
print('Number of testing problems:', len(full_pids))

## [1] Evaluate if the prediction is true or false
print('\nEvaluating the predictions...')
update_json_flag = False
for pid in full_pids:
problem = results[pid]
# print(problem)

if args.rerun:
if 'prediction' in problem:
del problem['prediction']
if 'true_false' in problem:
del problem['true_false']

choices = problem['choices']
question_type = problem['question_type']
answer_type = problem['answer_type']
precision = problem['precision']
extraction = problem['extraction']

if 'answer' in problem:
answer = problem['answer']
else:
answer = gts[pid]['answer']
problem['answer'] = answer

# normalize the extracted answer to match the answer type
prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)

# verify the prediction is true or false
true_false = safe_equal(prediction, answer)

# update the problem
if 'true_false' not in problem:
update_json_flag = True

elif true_false != problem['true_false']:
update_json_flag = True

if 'prediction' not in problem:
update_json_flag = True

elif prediction != problem['prediction']:
update_json_flag = True

problem['prediction'] = prediction
problem['true_false'] = true_false

# save the updated json
if update_json_flag:
print('\n!!!Some problems are updated.!!!')
print(f'\nSaving {output_file}...')
save_json(results, output_file)

## [2] Calculate the average accuracy
total = len(full_pids)
correct = 0
for pid in full_pids:
if results[pid]['true_false']:
correct += 1
accuracy = str(round(correct / total * 100, 2))
print(f'\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%')

scores = {'average': {'accuracy': accuracy, 'correct': correct, 'total': total}}

## [3] Calculate the fine-grained accuracy scores

# merge the 'metadata' attribute into the data
for pid in results:
results[pid].update(results[pid].pop('metadata'))

# convert the data to a pandas DataFrame
df = pd.DataFrame(results).T

    print('Number of test problems:', len(df))
# assert len(df) == 1000 # Important!!!

    # assign the target keys for evaluation
target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade',
'skills']

for key in target_keys:
print(f'\nType: [{key}]')
# get the unique values of the key
if key == 'skills':
# the value is a list
values = []
for i in range(len(df)):
values += df[key][i]
values = list(set(values))
else:
values = df[key].unique()
# print(values)

# calculate the accuracy for each value
scores[key] = {}
for value in values:
            correct, total, acc = get_acc_with_condition(df, key, value)
if total > 0:
print(f'[{value}]: {acc}% ({correct}/{total})')
scores[key][value] = {'accuracy': acc, 'correct': correct, 'total': total}

# sort the scores by accuracy
scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))

# save the scores
scores_file = os.path.join(args.output_dir, args.score_file)
print(f'\nSaving {scores_file}...')
save_json(scores, scores_file)
print('\nDone!')

# [4] Calculate the score gains over random guess
if args.caculate_gain:
random_file = os.path.join(args.output_dir, args.random_file)
        with open(random_file) as f:
            random_scores = json.load(f)

print('\nCalculating the score gains...')
for key in scores:
if key == 'average':
gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
scores[key]['acc_gain'] = gain
else:
for sub_key in scores[key]:
gain = round(
float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
scores[key][sub_key]['acc_gain'] = str(gain)

# save the score gains
print(f'\nSaving {scores_file}...')
save_json(scores, scores_file)
print('\nDone!')