From c9c5c5d92e3fc9984c485b0ebd9272cabc11b813 Mon Sep 17 00:00:00 2001
From: liushz
Date: Mon, 20 Nov 2023 16:48:55 +0800
Subject: [PATCH] Mathbench update postprocess (#600)

* Update mathbench

* Update mathbench
---
 configs/datasets/MathBench/mathbench_gen.py   |  2 +-
 ..._gen_10da90.py => mathbench_gen_ad37c1.py} | 22 +++++++------------
 configs/summarizers/mathbench.py              | 18 +++++++++++++++
 opencompass/datasets/mathbench.py             | 11 +++++++---
 opencompass/utils/text_postprocessors.py      |  9 +++++---
 5 files changed, 41 insertions(+), 21 deletions(-)
 rename configs/datasets/MathBench/{mathbench_gen_10da90.py => mathbench_gen_ad37c1.py} (94%)
 create mode 100644 configs/summarizers/mathbench.py

diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py
index 37dc53ec3..8cf9d3982 100644
--- a/configs/datasets/MathBench/mathbench_gen.py
+++ b/configs/datasets/MathBench/mathbench_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .mathbench_gen_10da90 import mathbench_datasets  # noqa: F401, F403
+    from .mathbench_gen_ad37c1 import mathbench_datasets  # noqa: F401, F403
diff --git a/configs/datasets/MathBench/mathbench_gen_10da90.py b/configs/datasets/MathBench/mathbench_gen_ad37c1.py
similarity index 94%
rename from configs/datasets/MathBench/mathbench_gen_10da90.py
rename to configs/datasets/MathBench/mathbench_gen_ad37c1.py
index 0a0013b2d..a6fa04ceb 100644
--- a/configs/datasets/MathBench/mathbench_gen_10da90.py
+++ b/configs/datasets/MathBench/mathbench_gen_ad37c1.py
@@ -3,17 +3,17 @@
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
 from opencompass.datasets import MathBenchDataset, mathbench_postprocess
-from opencompass.utils.text_postprocessors import first_capital_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess

 single_choice_prompts = {
     "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理并得到最终的答案选项。回答格式为如下:\n答案选项:A、B、C、D中你认为正确的一个选项\n计算过程:根据题目得到选项答案的一步步过程\n请严格按照上面的格式回答问题,下面是你要回答的题目:\n{question}\n答案选项:",
-    "single_choice_cn": "以下是一道关于数学的单项选择题,请你给出正确的答案选项。\n下面是你要回答的题目:\n{question}\n答案选项:",
+    "single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
     "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please provide the final answer option by step-by-step reasoning. Please answer in the following format:\nAnswer option: A, B, C, or D (the option you believe is correct)\nCalculation process: Step-by-step process to derive the answer option based on the question\nPlease strictly follow the above format to answer the question. Here is the question you need to answer:\n{question}\nAnswer option:",
     "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
 }

-cloze_prompts={
+cloze_prompts = {
     "cloze_cn": [
         dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
         dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
@@ -53,15 +53,13 @@
         dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
         dict(role='HUMAN', prompt='Q: {question}'),
         dict(role='BOT', prompt='A: {answer}\n'),
-],
-}
-
+]}

 mathbench_sets = {
     'college': ['single_choice_cn', 'cloze_en'],
     'high': ['single_choice_cn', 'single_choice_en'],
     'middle': ['single_choice_cn'],
-    'primary': ['cloze_cn'],
+    'primary': ['cloze_cn']
 }

 # Generate reasoning path if set True or just generate the final answer
@@ -75,10 +73,9 @@ for _split in list(mathbench_sets.keys()):
     for _name in mathbench_sets[_split]:

         mathbench_infer_cfg = dict(
-            ice_template=dict(
+            prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(
-                    begin="</E>",
                     round=[
                         dict(
                             role="HUMAN",
@@ -86,15 +83,14 @@
                         ),
                         dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                 ),
-            ice_token="</E>",
             ),
             retriever=dict(type=ZeroRetriever),
-            inferencer=dict(type=GenInferencer, max_out_len=512,),
+            inferencer=dict(type=GenInferencer, max_out_len=512),
         )

         mathbench_eval_cfg = dict(
             evaluator=dict(type=CircularEvaluator if 'choice' in _name else AccEvaluator),
-            pred_postprocessor=dict(type=first_capital_postprocess) if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
+            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

         mathbench_datasets.append(
             dict(
@@ -110,5 +106,3 @@
                 infer_cfg=mathbench_infer_cfg,
                 eval_cfg=mathbench_eval_cfg,
             ))
-
-del _split, _name
diff --git a/configs/summarizers/mathbench.py b/configs/summarizers/mathbench.py
new file mode 100644
index 000000000..ed30fc327
--- /dev/null
+++ b/configs/summarizers/mathbench.py
@@ -0,0 +1,18 @@
+summarizer = dict(
+    dataset_abbrs=[
+        '######## MathBench Accuracy ########',  # category
+        ['mathbench-college-single_choice_cn', 'acc_1'],
+        ['mathbench-college-cloze_en', 'accuracy'],
+        ['mathbench-high-single_choice_cn', 'acc_1'],
+        ['mathbench-high-single_choice_en', 'acc_1'],
+        ['mathbench-middle-single_choice_cn', 'acc_1'],
+        ['mathbench-primary-cloze_cn', 'accuracy'],
+        '######## MathBench CircularEval ########',  # category
+        ['mathbench-college-single_choice_cn', 'perf_4'],
+        ['mathbench-high-single_choice_cn', 'perf_4'],
+        ['mathbench-high-single_choice_en', 'perf_4'],
+        ['mathbench-middle-single_choice_cn', 'perf_4'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+)
diff --git a/opencompass/datasets/mathbench.py b/opencompass/datasets/mathbench.py
index dbc8d2634..0a7ebb157 100644
--- a/opencompass/datasets/mathbench.py
+++ b/opencompass/datasets/mathbench.py
@@ -71,10 +71,15 @@ def load(path: str, name: str, with_circular: bool = True):
                 else:
                     question = entry['question'].strip(
                     ) + '\n' + get_number(entry['options'])
-                    data.append({
+                    info = {
                         'question': question,
                         'answer': entry['answer'].strip()
-                    })
+                    }
+                    # For PPL evaluation
+                    for i in range(4):
+                        info[chr(ord('A') +
+                                 i)] = entry['options'][i].strip()
+                    data.append(info)

         dataset = Dataset.from_list(data)
         return dataset
@@ -91,7 +96,7 @@ def mathbench_postprocess(text: str, name: str) -> str:
             ans = ans_line[1].strip()

     output = re.sub(r'(\d),(\d)', r'\1\2', ans)
-    numbers = re.findall(r'-?\d*\.?\d+|\d+', output)
+    numbers = re.findall(r'-?\d*\.?/?\d+|\d+', output)
     if numbers:
         return numbers[-1]

diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index f36da4582..dfd1cfe56 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -53,9 +53,12 @@ def first_option_postprocess(text: str, options: str) -> str:
     patterns = [
         f'[Tt]he answer is [{options}]',
-        f'[Tt]he correct answer is [{options}]',
-        f'答案是(.*?)[{options}]',
-        f'答案为(.*?)[{options}]',
+        f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]',  # noqa
+        f'答案(?:选项)?是(.*?)[{options}]',
+        f'答案(?:选项)?为(.*?)[{options}]',
+        f'答案(?:选项)?选(.*?)[{options}]',
+        f'选项[{options}]是?正确',
+        f'选项[{options}]为?正确',
         f'固选(.*?)[{options}]',
         f'答案应该是(.*?)[{options}]',
         f'(\s|^)[{options}][\s。,,\.$]',  # noqa
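
Notes on the changes in this patch:

The new summarizer reports two metrics per multiple-choice split: 'acc_1', the plain accuracy, and 'perf_4', the CircularEval score, under which a question counts as correct only when the model answers it correctly for all four circular rotations of the options. Below is a minimal sketch of these semantics as I read them, not the CircularEvaluator implementation; it assumes per-question lists of four booleans, one per rotation.

    def circular_metrics(results: dict) -> dict:
        """results maps question id -> [bool, bool, bool, bool],
        one entry per circular rotation of the options."""
        n = len(results)
        # acc_1: judge only the original option ordering
        acc_1 = sum(r[0] for r in results.values()) / n
        # perf_4: all four rotations must be answered correctly
        perf_4 = sum(all(r) for r in results.values()) / n
        return {'acc_1': acc_1, 'perf_4': perf_4}

    print(circular_metrics({'q1': [True, True, True, True],
                            'q2': [True, False, True, True]}))
    # {'acc_1': 1.0, 'perf_4': 0.5}

CircularEval penalizes option-position bias: a model that picks 'A' regardless of content scores near chance on acc_1 but near zero on perf_4.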
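
The change to MathBenchDataset.load stores each option under its letter key ('A' through 'D') alongside 'question' and 'answer', so PPL-mode evaluation can score each option string separately. A small illustration with a hypothetical entry (the field values are invented for the example):

    entry = {
        'question': 'What is 2 + 3?',
        'answer': 'B',
        'options': ['4', '5', '6', '3'],
    }
    info = {
        'question': entry['question'],
        'answer': entry['answer'].strip(),
    }
    # Mirror of the loop added in the patch: one column per option letter
    for i in range(4):
        info[chr(ord('A') + i)] = entry['options'][i].strip()
    assert info['B'] == '5'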
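
In mathbench_postprocess, the number-matching pattern gains '/?', so a simple fraction such as 3/4 survives as one token instead of being split into 3 and 4. A standalone sketch of the extraction step follows; the helper name is mine, as in the repo this logic lives inside mathbench_postprocess.

    import re

    def last_number(ans: str) -> str:
        # Drop thousands separators first: '1,234' -> '1234'
        output = re.sub(r'(\d),(\d)', r'\1\2', ans)
        # '-?\d*\.?/?\d+' matches integers, decimals, and plain fractions
        numbers = re.findall(r'-?\d*\.?/?\d+|\d+', output)
        # The last number in the response is taken as the final answer
        return numbers[-1] if numbers else ans

    print(last_number('So she has $1,234 left. The answer is 1,234'))  # 1234
    print(last_number('The probability is 3/4'))                       # 3/4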
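
Finally, first_option_postprocess replaces first_capital_postprocess for the single-choice splits. Rather than taking the first capital letter anywhere in the output (which misreads answers such as 'As shown above, the answer is B'), it scans the response with the pattern list extended in the last hunk and returns the matched option. A condensed, self-contained sketch of that approach; the real function carries more patterns and fallbacks than this hunk shows, and the escaping here is cleaned up with rf-strings.

    import re

    def extract_first_option(text: str, options: str = 'ABCD') -> str:
        patterns = [
            rf'[Tt]he answer is [{options}]',
            rf'[Tt]he correct answer\s?(?:option)?\s?is [{options}]',
            rf'答案(?:选项)?是(.*?)[{options}]',
            rf'答案(?:选项)?为(.*?)[{options}]',
            rf'答案(?:选项)?选(.*?)[{options}]',
            rf'选项[{options}]是?正确',
            rf'选项[{options}]为?正确',
            rf'答案应该是(.*?)[{options}]',
            rf'(\s|^)[{options}][\s。,,\.$]',
        ]
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                # Return the option letter found inside the matched span
                for letter in options:
                    if letter in match.group(0):
                        return letter
        return ''

    print(extract_first_option('经过计算,答案选项是 C。'))  # C
    print(extract_first_option('The answer is B.'))           # B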