open-compass · tonysy · Nov 20, 2023 · Nov 16, 2023 · Nov 16, 2023
diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .mathbench_gen_10da90 import mathbench_datasets  # noqa: F401, F403
+    from .mathbench_gen_ad37c1 import mathbench_datasets  # noqa: F401, F403
diff --git a/...atasets/MathBench/mathbench_gen_10da90.py → ...atasets/MathBench/mathbench_gen_ad37c1.py b/...atasets/MathBench/mathbench_gen_10da90.py → ...atasets/MathBench/mathbench_gen_ad37c1.py
@@ -3,17 +3,17 @@
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
 from opencompass.datasets import MathBenchDataset, mathbench_postprocess
-from opencompass.utils.text_postprocessors import first_capital_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
 
 
 single_choice_prompts = {
     "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题，请你一步一步推理并得到最终的答案选项。回答格式为如下：\n答案选项：A、B、C、D中你认为正确的一个选项\n计算过程：根据题目得到选项答案的一步步过程\n请严格按照上面的格式回答问题，下面是你要回答的题目：\n{question}\n答案选项：",
-    "single_choice_cn": "以下是一道关于数学的单项选择题，请你给出正确的答案选项。\n下面是你要回答的题目：\n{question}\n答案选项：",
+    "single_choice_cn": "以下是一道关于数学的单项选择题，请你直接回答正确答案的选项序号。\n下面是你要回答的题目：\n{question}\n答案选项：",
     "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please provide the final answer option by step-by-step reasoning. Please answer in the following format:\nAnswer option: A, B, C, or D (the option you believe is correct)\nCalculation process: Step-by-step process to derive the answer option based on the question\nPlease strictly follow the above format to answer the question. Here is the question you need to answer:\n{question}\nAnswer option:",
     "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
 }
 
-cloze_prompts={
+cloze_prompts = {
     "cloze_cn": [
                 dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后，将有21棵树。林务工人员今天种植了多少棵树？'),
                 dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以，他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
@@ -53,15 +53,13 @@
                 dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
                 dict(role='HUMAN', prompt='Q: {question}'),
                 dict(role='BOT', prompt='A: {answer}\n'),
-],
-}
-
+]}
 
 mathbench_sets = {
     'college': ['single_choice_cn', 'cloze_en'],
     'high': ['single_choice_cn', 'single_choice_en'],
     'middle': ['single_choice_cn'],
-    'primary': ['cloze_cn'],
+    'primary': ['cloze_cn']
 }
 
 # Generate reasoning path if set True or just generate the final answer
@@ -75,26 +73,24 @@
 for _split in list(mathbench_sets.keys()):
     for _name in mathbench_sets[_split]:
         mathbench_infer_cfg = dict(
-            ice_template=dict(
+            prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(
-                    begin="</E>",
                     round=[
                         dict(
                             role="HUMAN",
                             prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
                         ),
                         dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                     ),
-                ice_token="</E>",
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=512,),
+            inferencer=dict(type=GenInferencer, max_out_len=512),
         )
 
         mathbench_eval_cfg = dict(
             evaluator=dict(type=CircularEvaluator if 'choice' in _name else AccEvaluator),
-            pred_postprocessor=dict(type=first_capital_postprocess) if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
+            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
 
         mathbench_datasets.append(
             dict(
@@ -110,5 +106,3 @@
                 infer_cfg=mathbench_infer_cfg,
                 eval_cfg=mathbench_eval_cfg,
             ))
-
-del _split, _name
diff --git a/configs/summarizers/mathbench.py b/configs/summarizers/mathbench.py
@@ -0,0 +1,18 @@
+summarizer = dict(  # summarizer config for the MathBench datasets
+    dataset_abbrs=[  # str = category row; list = presumably [dataset abbr, metric name] - confirm
+        '######## MathBench Accuracy ########', # category
+        ['mathbench-college-single_choice_cn', 'acc_1'],
+        ['mathbench-college-cloze_en', 'accuracy'],
+        ['mathbench-high-single_choice_cn', 'acc_1'],
+        ['mathbench-high-single_choice_en', 'acc_1'],
+        ['mathbench-middle-single_choice_cn', 'acc_1'],
+        ['mathbench-primary-cloze_cn', 'accuracy'],
+        '######## MathBench CircularEval ########', # category
+        ['mathbench-college-single_choice_cn', 'perf_4'],
+        ['mathbench-high-single_choice_cn', 'perf_4'],
+        ['mathbench-high-single_choice_en', 'perf_4'],
+        ['mathbench-middle-single_choice_cn', 'perf_4'],
+    ],
+    summary_groups=sum(  # concatenate every *_summary_groups list found in locals()
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+)  # NOTE(review): this file defines no *_summary_groups name, so the result looks like [] - confirm
diff --git a/opencompass/datasets/mathbench.py b/opencompass/datasets/mathbench.py
@@ -71,10 +71,15 @@ def load(path: str, name: str, with_circular: bool = True):
                     else:
                         question = entry['question'].strip(
                         ) + '\n' + get_number(entry['options'])
-                        data.append({
+                        info = {
                             'question': question,
                             'answer': entry['answer'].strip()
-                        })
+                        }
+                        # Keep options under keys A-D for PPL evaluation
+                        for i in range(4):
+                            info[chr(ord('A') +
+                                     i)] = entry['options'][i].strip()
+                        data.append(info)
 
         dataset = Dataset.from_list(data)
         return dataset
@@ -91,7 +96,7 @@ def mathbench_postprocess(text: str, name: str) -> str:
         ans = ans_line[1].strip()
 
     output = re.sub(r'(\d),(\d)', r'\1\2', ans)
-    numbers = re.findall(r'-?\d*\.?\d+|\d+', output)
+    numbers = re.findall(r'-?\d*\.?/?\d+|\d+', output)
     if numbers:
         return numbers[-1]
 

diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
@@ -53,9 +53,12 @@ def first_option_postprocess(text: str, options: str) -> str:
 
     patterns = [
         f'[Tt]he answer is [{options}]',
-        f'[Tt]he correct answer is [{options}]',
-        f'答案是(.*?)[{options}]',
-        f'答案为(.*?)[{options}]',
+        f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]',  # noqa
+        f'答案(?:选项)?是(.*?)[{options}]',
+        f'答案(?:选项)?为(.*?)[{options}]',
+        f'答案(?:选项)?选(.*?)[{options}]',
+        f'选项[{options}]是?正确',
+        f'选项[{options}]为?正确',
         f'固选(.*?)[{options}]',
         f'答案应该是(.*?)[{options}]',
         f'(\s|^)[{options}][\s。，,\.$]',  # noqa