From ba620c4afe458ed04962f1fb21f4cf3fd3a04682 Mon Sep 17 00:00:00 2001 From: liushz Date: Tue, 28 May 2024 17:17:54 +0800 Subject: [PATCH 01/12] Update accelerator (#1195) * Add Math Evaluation with Judge Model Evaluator * Add Math Evaluation with Judge Model Evaluator * Add Math Evaluation with Judge Model Evaluator * Add Math Evaluation with Judge Model Evaluator * Fix Llama-3 meta template * Fix MATH with JudgeLM Evaluation * Fix MATH with JudgeLM Evaluation * Fix MATH with JudgeLM Evaluation * Fix MATH with JudgeLM Evaluation * Update acclerator * Update MathBench * Update accelerator --------- Co-authored-by: liuhongwei --- opencompass/utils/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index fbcf60c35..6d65e4546 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -279,7 +279,8 @@ def change_accelerator(models, accelerator): else: raise ValueError(f'Unsupported accelerator {accelerator} for model type {model["type"]}') else: - raise ValueError(f'Unsupported model type {model["type"]}') + acc_model = model + logger.warning(f'Unsupported model type {model["type"]}, will keep the original model.') model_accels.append(acc_model) return model_accels From 2954913d9bb8f65a68468f4bbe0b4ac6b3fdd3ac Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Tue, 28 May 2024 23:09:59 +0800 Subject: [PATCH 02/12] [Sync] bump version (#1204) --- .pre-commit-config.yaml | 1 + configs/datasets/CHARM/charm_rea_gen.py | 4 - .../CHARM/charm_reason_cot_only_gen_f7b7d3.py | 50 +++ configs/datasets/CHARM/charm_reason_gen.py | 4 + .../datasets/CHARM/charm_reason_gen_f8fca2.py | 49 +++ ...gen_f8fca2.py => charm_reason_settings.py} | 50 +-- configs/datasets/GaokaoBench/README.md | 186 +++++++++ ... deprecated_mathbench_agent_gen_48ec47.py} | 0 ... 
deprecated_mathbench_arith_gen_ccd638.py} | 0 ...=> deprecated_mathbench_cot_gen_66f329.py} | 0 ....py => deprecated_mathbench_gen_7b734b.py} | 0 .../MathBench/mathbench_2024_gen_19e486.py | 114 ------ .../MathBench/mathbench_2024_gen_1dc21d.py | 81 ++++ .../mathbench_2024_wocircular_gen_1dc21d.py | 81 ++++ .../mathbench_2024_wocircular_gen_649349.py | 124 ------ .../mathbench_2024_wocircular_mixed_8eb12b.py | 81 ++++ .../mathbench_2024_wocircular_mixed_dc0207.py | 114 ------ configs/datasets/MathBench/mathbench_gen.py | 2 +- .../datasets/MathBench/mathbench_prompt.py | 103 +++++ configs/datasets/TheoremQA/README.md | 64 +++ configs/datasets/bbh/README.md | 245 ++++++++++++ configs/datasets/ceval/README.md | 367 ++++++++++++++++++ configs/datasets/gpqa/README.md | 64 +++ configs/datasets/gsm8k/README.md | 64 +++ configs/datasets/hellaswag/README.md | 64 +++ configs/datasets/humaneval/README.md | 64 +++ configs/datasets/math/README.md | 64 +++ configs/datasets/mbpp/README.md | 64 +++ configs/datasets/mmlu/README.md | 363 +++++++++++++++++ configs/datasets/nq/README.md | 64 +++ configs/datasets/race/README.md | 64 +++ configs/datasets/triviaqa/README.md | 64 +++ configs/datasets/winogrande/README.md | 64 +++ configs/eval_charm.py | 32 +- configs/eval_subjective_alignbench.py | 1 - configs/eval_subjective_alpacaeval_oc.py | 2 +- .../eval_subjective_alpacaeval_official.py | 2 +- configs/eval_subjective_compassarena.py | 1 - configs/eval_subjective_creationbench.py | 1 - configs/eval_subjective_judge_pandalm.py | 2 +- configs/eval_subjective_mtbench.py | 1 - configs/models/hf_internlm/README.md | 124 ++++++ .../lmdeploy_internlm2_chat_1_8b.py | 23 ++ .../hf_internlm/lmdeploy_internlm2_series.py | 2 + configs/models/hf_llama/hf_llama2_13b.py | 2 +- configs/models/hf_llama/hf_llama2_13b_chat.py | 2 +- configs/models/hf_llama/hf_llama2_70b.py | 2 +- configs/models/hf_llama/hf_llama3_70b.py | 2 +- configs/models/hf_llama/hf_llama_13b.py | 2 +- configs/models/hf_llama/hf_llama_30b.py | 2 +- configs/models/hf_llama/hf_llama_65b.py | 2 +- .../mistral/hf_mistral_7b_instruct_v0_3.py | 12 + configs/models/mistral/hf_mistral_7b_v0_3.py | 13 + configs/models/qwen/README.md | 142 +++++++ configs/models/qwen/hf_qwen1_5_moe_a2_7b.py | 23 +- .../models/qwen/hf_qwen1_5_moe_a2_7b_chat.py | 31 +- configs/models/qwen/hf_qwen2_0_5b.py | 12 + configs/models/qwen/hf_qwen2_1_5b.py | 12 + configs/models/qwen/hf_qwen2_72b.py | 12 + configs/models/qwen/hf_qwen2_7b.py | 12 + configs/models/qwen/hf_qwen2_moe_57b_a14b.py | 12 + configs/models/qwen/hf_qwen_14b.py | 2 +- configs/models/qwen/hf_qwen_14b_chat.py | 25 +- configs/models/qwen/hf_qwen_1_8b_chat.py | 23 +- configs/models/qwen/hf_qwen_72b.py | 2 +- configs/models/qwen/hf_qwen_72b_chat.py | 25 +- configs/models/qwen/hf_qwen_7b_chat.py | 23 +- configs/models/yi/hf_yi_1_5_34b.py | 12 + configs/models/yi/hf_yi_1_5_6b.py | 12 + configs/models/yi/hf_yi_1_5_9b.py | 12 + configs/subjective/model_cfg.py | 6 +- configs/summarizers/charm_rea.py | 98 ----- configs/summarizers/charm_reason.py | 98 +++++ .../summarizers/chat_OC15_multi_faceted.py | 3 +- configs/summarizers/groups/charm_rea.py | 30 -- configs/summarizers/groups/charm_reason.py | 35 ++ configs/summarizers/groups/mathbench_2024.py | 26 ++ opencompass/__init__.py | 2 +- opencompass/datasets/charm.py | 13 +- opencompass/datasets/mbpp.py | 5 +- opencompass/models/__init__.py | 2 +- opencompass/models/huggingface_above_v4_33.py | 1 - opencompass/models/lightllm_api.py | 163 +++++++- opencompass/models/openai_api.py | 147 
+------ .../icl_inferencer/icl_base_inferencer.py | 2 + opencompass/runners/dlc.py | 4 + opencompass/summarizers/default.py | 8 +- opencompass/summarizers/multi_faceted.py | 10 +- opencompass/tasks/openicl_eval.py | 2 + opencompass/utils/run.py | 4 +- opencompass/utils/text_postprocessors.py | 18 +- requirements/runtime.txt | 1 + 92 files changed, 3228 insertions(+), 795 deletions(-) delete mode 100644 configs/datasets/CHARM/charm_rea_gen.py create mode 100644 configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py create mode 100644 configs/datasets/CHARM/charm_reason_gen.py create mode 100644 configs/datasets/CHARM/charm_reason_gen_f8fca2.py rename configs/datasets/CHARM/{charm_rea_gen_f8fca2.py => charm_reason_settings.py} (53%) create mode 100644 configs/datasets/GaokaoBench/README.md rename configs/datasets/MathBench/{mathbench_agent_gen_48ec47.py => deprecated_mathbench_agent_gen_48ec47.py} (100%) rename configs/datasets/MathBench/{mathbench_arith_gen_ccd638.py => deprecated_mathbench_arith_gen_ccd638.py} (100%) rename configs/datasets/MathBench/{mathbench_cot_gen_66f329.py => deprecated_mathbench_cot_gen_66f329.py} (100%) rename configs/datasets/MathBench/{mathbench_gen_7b734b.py => deprecated_mathbench_gen_7b734b.py} (100%) delete mode 100644 configs/datasets/MathBench/mathbench_2024_gen_19e486.py create mode 100644 configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py create mode 100644 configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py delete mode 100644 configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py create mode 100644 configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py delete mode 100644 configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py create mode 100644 configs/datasets/MathBench/mathbench_prompt.py create mode 100644 configs/datasets/TheoremQA/README.md create mode 100644 configs/datasets/bbh/README.md create mode 100644 configs/datasets/ceval/README.md create mode 100644 configs/datasets/gpqa/README.md create mode 100644 configs/datasets/gsm8k/README.md create mode 100644 configs/datasets/hellaswag/README.md create mode 100644 configs/datasets/humaneval/README.md create mode 100644 configs/datasets/math/README.md create mode 100644 configs/datasets/mbpp/README.md create mode 100644 configs/datasets/mmlu/README.md create mode 100644 configs/datasets/nq/README.md create mode 100644 configs/datasets/race/README.md create mode 100644 configs/datasets/triviaqa/README.md create mode 100644 configs/datasets/winogrande/README.md create mode 100644 configs/models/hf_internlm/README.md create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py create mode 100644 configs/models/mistral/hf_mistral_7b_instruct_v0_3.py create mode 100644 configs/models/mistral/hf_mistral_7b_v0_3.py create mode 100644 configs/models/qwen/README.md create mode 100644 configs/models/qwen/hf_qwen2_0_5b.py create mode 100644 configs/models/qwen/hf_qwen2_1_5b.py create mode 100644 configs/models/qwen/hf_qwen2_72b.py create mode 100644 configs/models/qwen/hf_qwen2_7b.py create mode 100644 configs/models/qwen/hf_qwen2_moe_57b_a14b.py create mode 100644 configs/models/yi/hf_yi_1_5_34b.py create mode 100644 configs/models/yi/hf_yi_1_5_6b.py create mode 100644 configs/models/yi/hf_yi_1_5_9b.py delete mode 100644 configs/summarizers/charm_rea.py create mode 100644 configs/summarizers/charm_reason.py delete mode 100644 configs/summarizers/groups/charm_rea.py create mode 100644 configs/summarizers/groups/charm_reason.py 
create mode 100644 configs/summarizers/groups/mathbench_2024.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 63e3d4840..2a94a1e53 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,6 +35,7 @@ repos: exclude: | (?x)^( .*\.jsonl| + .*\.md.template| configs/ ) - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/configs/datasets/CHARM/charm_rea_gen.py b/configs/datasets/CHARM/charm_rea_gen.py deleted file mode 100644 index e98b0d086..000000000 --- a/configs/datasets/CHARM/charm_rea_gen.py +++ /dev/null @@ -1,4 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .charm_rea_gen_f8fca2 import charm_rea_datasets # noqa: F401, F403 diff --git a/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py b/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py new file mode 100644 index 000000000..a9485f442 --- /dev/null +++ b/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py @@ -0,0 +1,50 @@ +import os +from mmengine.config import read_base + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator + +with read_base(): + from .charm_reason_settings import charm_tasks, settings + +settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']] + +charm_reason_datasets = [] + +for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings: + for _task in charm_tasks: + _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt') + with open(_fewshot_example_file, 'r') as f: + _hint = f.read() + + charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target') + + charm_reason_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + charm_reason_eval_cfg = dict( + evaluator=dict(type=CharmReasonEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=charm_reason_postprocess), + dataset_postprocessor=dict(type=charm_reason_postprocess), + ) + + charm_reason_datasets.append( + dict( + type=CharmDataset, + path=dataset_path, + name=_task, + abbr='charm-reason-' + _task + '_' + _cot, + reader_cfg=charm_reason_reader_cfg, + infer_cfg=charm_reason_infer_cfg.copy(), + eval_cfg=charm_reason_eval_cfg.copy(), + ) + ) diff --git a/configs/datasets/CHARM/charm_reason_gen.py b/configs/datasets/CHARM/charm_reason_gen.py new file mode 100644 index 000000000..78ba9697d --- /dev/null +++ b/configs/datasets/CHARM/charm_reason_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .charm_reason_gen_f8fca2 import charm_reason_datasets # noqa: F401, F403 diff --git a/configs/datasets/CHARM/charm_reason_gen_f8fca2.py b/configs/datasets/CHARM/charm_reason_gen_f8fca2.py new file mode 100644 index 000000000..07374df92 --- /dev/null +++ b/configs/datasets/CHARM/charm_reason_gen_f8fca2.py @@ -0,0 +1,49 @@ +import os +from mmengine.config import read_base + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CharmDataset, 
charm_reason_postprocess, CharmReasonEvaluator + +with read_base(): + from .charm_reason_settings import charm_tasks, settings + + +charm_reason_datasets = [] + +for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings: + for _task in charm_tasks: + _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt') + with open(_fewshot_example_file, 'r') as f: + _hint = f.read() + + charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target') + + charm_reason_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + charm_reason_eval_cfg = dict( + evaluator=dict(type=CharmReasonEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=charm_reason_postprocess), + dataset_postprocessor=dict(type=charm_reason_postprocess), + ) + + charm_reason_datasets.append( + dict( + type=CharmDataset, + path=dataset_path, + name=_task, + abbr='charm-reason-' + _task + '_' + _cot, + reader_cfg=charm_reason_reader_cfg, + infer_cfg=charm_reason_infer_cfg.copy(), + eval_cfg=charm_reason_eval_cfg.copy(), + ) + ) diff --git a/configs/datasets/CHARM/charm_rea_gen_f8fca2.py b/configs/datasets/CHARM/charm_reason_settings.py similarity index 53% rename from configs/datasets/CHARM/charm_rea_gen_f8fca2.py rename to configs/datasets/CHARM/charm_reason_settings.py index f9af1c6ee..b5c4fbde6 100644 --- a/configs/datasets/CHARM/charm_rea_gen_f8fca2.py +++ b/configs/datasets/CHARM/charm_reason_settings.py @@ -1,8 +1,4 @@ import os -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import CharmDataset, charm_rea_postprocess, CharmReaEvaluator charm_tasks = [ 'Chinese_Anachronisms_Judgment', @@ -21,16 +17,16 @@ 'Global_Time_Understanding', ] +XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n' +Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: ' +Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:' + data_dir = 'data/CHARM' dataset_path_ZH = f'{data_dir}/reasoning' dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN' fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples') fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN') -XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. 
\n Request: {{input}}\n' -Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: ' -Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:' - settings = [ ('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template), ('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template), @@ -38,41 +34,3 @@ ('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template), ('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template), ] - -charm_rea_datasets = [] - -for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings: - for _task in charm_tasks: - _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt') - with open(_fewshot_example_file, 'r') as f: - _hint = f.read() - - charm_rea_reader_cfg = dict(input_columns=['input'], output_column='target') - - charm_rea_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ) - - charm_rea_eval_cfg = dict( - evaluator=dict(type=CharmReaEvaluator), - pred_role='BOT', - pred_postprocessor=dict(type=charm_rea_postprocess), - dataset_postprocessor=dict(type=charm_rea_postprocess), - ) - - charm_rea_datasets.append( - dict( - type=CharmDataset, - path=dataset_path, - name=_task, - abbr='charm-rea-' + _task + '_' + _cot, - reader_cfg=charm_rea_reader_cfg, - infer_cfg=charm_rea_infer_cfg.copy(), - eval_cfg=charm_rea_eval_cfg.copy(), - ) - ) diff --git a/configs/datasets/GaokaoBench/README.md b/configs/datasets/GaokaoBench/README.md new file mode 100644 index 000000000..d05c091cc --- /dev/null +++ b/configs/datasets/GaokaoBench/README.md @@ -0,0 +1,186 @@ +# GaokaoBench + +## Base Models + +| model | GaokaoBench | +|:------------------------:|--------------:| +| llama-7b-turbomind | 14.55 | +| llama-13b-turbomind | 16.20 | +| llama-30b-turbomind | 16.14 | +| llama-65b-turbomind | 13.31 | +| llama-2-7b-turbomind | 15.02 | +| llama-2-13b-turbomind | 14.86 | +| llama-2-70b-turbomind | 16.36 | +| llama-3-8b-turbomind | 20.88 | +| llama-3-70b-turbomind | 19.98 | +| internlm2-1.8b-turbomind | 23.78 | +| internlm2-7b-turbomind | 41.41 | +| internlm2-20b-turbomind | 58.99 | +| qwen-1.8b-turbomind | 22.11 | +| qwen-7b-turbomind | 35.32 | +| qwen-14b-turbomind | 54.07 | +| qwen-72b-turbomind | 77.56 | +| qwen1.5-0.5b-hf | 30.67 | +| qwen1.5-1.8b-hf | 35.66 | +| qwen1.5-4b-hf | 54.31 | +| qwen1.5-7b-hf | 65.99 | +| qwen1.5-14b-hf | 66.60 | +| qwen1.5-32b-hf | 79.01 | +| qwen1.5-72b-hf | 80.26 | +| qwen1.5-moe-a2-7b-hf | 52.79 | +| mistral-7b-v0.1-hf | 14.35 | +| mistral-7b-v0.2-hf | 11.10 | +| mixtral-8x7b-v0.1-hf | 8.40 | +| mixtral-8x22b-v0.1-hf | 16.23 | +| yi-6b-hf | 31.70 | +| yi-34b-hf | 30.51 | +| deepseek-7b-base-hf | 17.02 | +| deepseek-67b-base-hf | 10.14 | + +### Details + +| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs | 
+|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:| +| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 | +| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 | +| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 | +| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 | +| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 | +| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 | +| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 | +| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 | +| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 | +| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 | +| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 | +| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 | +| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 | +| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 | +| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 | +| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 | +| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 | +| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 | +| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 | +| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 | +| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 | +| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 | +| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 | +| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 | +| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 | +| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 | +| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 | +| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 | +| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 | +| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 | +| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 | +| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 | + +| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs | +|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:| +| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 | +| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 | +| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 | +| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 | +| 
llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 | +| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 | +| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 | +| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 | +| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 | +| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 | +| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 | +| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 | +| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 | +| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 | +| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 | +| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 | +| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 | +| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 | +| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 | +| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 | +| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 | +| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 | +| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 | +| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 | +| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 | +| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 | +| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 | +| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 | +| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 | +| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 | +| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 | +| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 | + +## Chat Models + +| model | GaokaoBench | +|:-----------------------------:|--------------:| +| qwen1.5-0.5b-chat-hf | 21.51 | +| qwen1.5-1.8b-chat-hf | 46.19 | +| qwen1.5-4b-chat-hf | 59.11 | +| qwen1.5-7b-chat-hf | 70.55 | +| qwen1.5-14b-chat-hf | 80.39 | +| qwen1.5-32b-chat-hf | 86.15 | +| qwen1.5-72b-chat-hf | 88.58 | +| qwen1.5-110b-chat-hf | 89.59 | +| internlm2-chat-1.8b-hf | 29.73 | +| internlm2-chat-1.8b-sft-hf | 28.79 | +| internlm2-chat-7b-hf | 54.54 | +| internlm2-chat-7b-sft-hf | 55.39 | +| internlm2-chat-20b-hf | 57.95 | +| internlm2-chat-20b-sft-hf | 57.62 | +| llama-3-8b-instruct-hf | 45.48 | +| llama-3-70b-instruct-hf | 65.91 | +| llama-3-8b-instruct-lmdeploy | 44.48 | +| llama-3-70b-instruct-lmdeploy | 67.06 | +| mistral-7b-instruct-v0.1-hf | 26.21 | +| mistral-7b-instruct-v0.2-hf | 32.17 | +| mixtral-8x7b-instruct-v0.1-hf | 42.46 | + +### Details + +| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs | +|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:| +| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 
39.02 | 24.67 | 25.00 | 0.78 | 25.00 | +| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 | +| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 | +| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 | +| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 | +| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 | +| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 | +| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 | +| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 | +| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 | +| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 | +| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 | +| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 | +| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 | +| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 | +| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 | +| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 | +| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 | +| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 | +| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 | +| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 | + +| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs | +|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:| +| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 | +| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 | +| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 | +| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 | +| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 | +| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 | +| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 | +| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 | +| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 | +| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 | +| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 | +| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 | +| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 | +| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 | +| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 | +| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 
| +| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 | +| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 | +| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 | +| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 | +| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 | diff --git a/configs/datasets/MathBench/mathbench_agent_gen_48ec47.py b/configs/datasets/MathBench/deprecated_mathbench_agent_gen_48ec47.py similarity index 100% rename from configs/datasets/MathBench/mathbench_agent_gen_48ec47.py rename to configs/datasets/MathBench/deprecated_mathbench_agent_gen_48ec47.py diff --git a/configs/datasets/MathBench/mathbench_arith_gen_ccd638.py b/configs/datasets/MathBench/deprecated_mathbench_arith_gen_ccd638.py similarity index 100% rename from configs/datasets/MathBench/mathbench_arith_gen_ccd638.py rename to configs/datasets/MathBench/deprecated_mathbench_arith_gen_ccd638.py diff --git a/configs/datasets/MathBench/mathbench_cot_gen_66f329.py b/configs/datasets/MathBench/deprecated_mathbench_cot_gen_66f329.py similarity index 100% rename from configs/datasets/MathBench/mathbench_cot_gen_66f329.py rename to configs/datasets/MathBench/deprecated_mathbench_cot_gen_66f329.py diff --git a/configs/datasets/MathBench/mathbench_gen_7b734b.py b/configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py similarity index 100% rename from configs/datasets/MathBench/mathbench_gen_7b734b.py rename to configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py diff --git a/configs/datasets/MathBench/mathbench_2024_gen_19e486.py b/configs/datasets/MathBench/mathbench_2024_gen_19e486.py deleted file mode 100644 index 6cde6c692..000000000 --- a/configs/datasets/MathBench/mathbench_2024_gen_19e486.py +++ /dev/null @@ -1,114 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator -from opencompass.datasets import MathBenchDataset, mathbench_postprocess -from opencompass.utils.text_postprocessors import first_option_postprocess - -single_choice_prompts = { - 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n', - 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:', - 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:", - 'single_choice_en': 'Here is a multiple-choice question about mathematics. 
Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:', -} - -cloze_prompts = { - 'cloze_cn': [ - dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'), - dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'), - dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'), - dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'), - dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'), - dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'), - dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'), - dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'), - dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'), - dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'), - dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'), - dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'), - dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'), - dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'), - dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'), - dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'), - dict(role='HUMAN', prompt='Q: {question}'), - dict(role='BOT', prompt='A: {answer}'), - ], - 'cloze_en': [ - dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'), - dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'), - dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'), - dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'), - dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'), - dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"), - dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'), - dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'), - dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'), - dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. 
The answer is 9.\n'), - dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'), - dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'), - dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'), - dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'), - dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'), - dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'), - dict(role='HUMAN', prompt='Q: {question}'), - dict(role='BOT', prompt='A: {answer}\n'), -]} - -mathbench_sets = { - # Practice Part - 'college': ['single_choice_cn', 'single_choice_en'], - 'high': ['single_choice_cn', 'single_choice_en'], - 'middle': ['single_choice_cn', 'single_choice_en'], - 'primary': ['cloze_cn', 'cloze_en'], - 'arithmetic': ['cloze_en'], - # Theory part - 'college_knowledge': ['single_choice_cn','single_choice_en'], - 'high_knowledge': ['single_choice_cn','single_choice_en'], - 'middle_knowledge': ['single_choice_cn','single_choice_en'], - 'primary_knowledge': ['single_choice_cn','single_choice_en'], -} - -# Generate reasoning path or not, only for single choice -with_reasoning = True - -# Use circular evaluation or not -with_circular_eval = True - -mathbench_datasets = [] - -for _split in list(mathbench_sets.keys()): - for _name in mathbench_sets[_split]: - mathbench_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict( - role='HUMAN', - prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name], - ), - dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name], - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), - ) - - mathbench_eval_cfg = dict( - evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name)) - - mathbench_datasets.append( - dict( - abbr='mathbench-' + _split + '-' + _name, - type=MathBenchDataset, - path=f'./data/mathbench_v1/{_split}', - name=_name, - with_circular=with_circular_eval, - reader_cfg=dict( - input_columns=['question'], - output_column='answer' - ), - infer_cfg=mathbench_infer_cfg, - eval_cfg=mathbench_eval_cfg, - )) diff --git a/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py new file mode 100644 index 000000000..af8e4cd86 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import 
PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = False +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py new file mode 100644 index 000000000..67b7847c3 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_1dc21d.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import 
first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = False +# Use circular evaluation or not +with_circular_eval = False +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-wocircular-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py b/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py deleted file mode 100644 index 7cd6790cc..000000000 --- a/configs/datasets/MathBench/mathbench_2024_wocircular_gen_649349.py +++ /dev/null @@ -1,124 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator -from opencompass.datasets import MathBenchDataset, mathbench_postprocess -from opencompass.utils.text_postprocessors import first_option_postprocess - -single_choice_prompts = { - 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n', - 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:', - 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. 
Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:", - 'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:', -} - -cloze_prompts = { - 'cloze_cn': [ - dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'), - dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'), - dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'), - dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'), - dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'), - dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'), - dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'), - dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'), - dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'), - dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'), - dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'), - dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'), - dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'), - dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'), - dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'), - dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'), - dict(role='HUMAN', prompt='Q: {question}'), - dict(role='BOT', prompt='A: {answer}'), - ], - 'cloze_en': [ - dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'), - dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'), - dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'), - dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'), - dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'), - dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"), - dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'), - dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. 
The answer is 8.\n'), - dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'), - dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'), - dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'), - dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'), - dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'), - dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'), - dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'), - dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'), - dict(role='HUMAN', prompt='Q: {question}'), - dict(role='BOT', prompt='A: {answer}\n'), - ], -} - -mathbench_sets = { - # Practice Part - 'college': ['single_choice_cn', 'single_choice_en'], - 'high': ['single_choice_cn', 'single_choice_en'], - 'middle': ['single_choice_cn', 'single_choice_en'], - 'primary': ['cloze_cn', 'cloze_en'], - 'calculate': ['cloze_en'], - # Theory part - 'college_knowledge': ['single_choice_cn', 'single_choice_en'], - 'high_knowledge': ['single_choice_cn', 'single_choice_en'], - 'middle_knowledge': ['single_choice_cn', 'single_choice_en'], - 'primary_knowledge': ['single_choice_cn', 'single_choice_en'], -} - -# Generate reasoning path or not, only for single choice -with_reasoning = True - -# Use circular evaluation or not -with_circular_eval = False - -mathbench_datasets = [] - -for _split in list(mathbench_sets.keys()): - for _name in mathbench_sets[_split]: - mathbench_reader_cfg = dict( - input_columns=['question'], - output_column='answer', - ) - - if 'single_choice' in _name: - template_round = [ - dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])), - dict(role='BOT', prompt='{answer}') - ] - else: - template_round = cloze_prompts[_name] - mathbench_infer_cfg = dict( - prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ) - - if 'single_choice' in _name: - pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') - else: - pred_postprocessor = dict(type=mathbench_postprocess, name=_name) - if 'single_choice' in _name and with_circular_eval: - evaluator = dict(type=CircularEvaluator) - else: - evaluator = dict(type=AccEvaluator) - mathbench_eval_cfg = dict( - evaluator=evaluator, - pred_postprocessor=pred_postprocessor, - ) - - mathbench_datasets.append( - dict( - abbr='mathbench-' + _split + '-' + _name, - type=MathBenchDataset, - 
path=f'./data/mathbench_v1_ori/{_split}', - name=_name, - with_circular=with_circular_eval, - reader_cfg=mathbench_reader_cfg, - infer_cfg=mathbench_infer_cfg, - eval_cfg=mathbench_eval_cfg, - ) - ) diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py new file mode 100644 index 000000000..ac698c208 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_8eb12b.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, mathbench_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 4 +# Generate reasoning path or not, only for single choice +with_reasoning = False +# Use circular evaluation or not +with_circular_eval = False +# Use PPL mode in single choice test or not +use_ppl_single_choice = True + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=mathbench_postprocess, name=_name) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-wocircular-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py 
deleted file mode 100644 index 9f6d08c2b..000000000 --- a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_dc0207.py +++ /dev/null @@ -1,114 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer -from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator -from opencompass.datasets import MathBenchDataset, mathbench_postprocess -from opencompass.utils.text_postprocessors import first_option_postprocess - -single_choice_prompts = { - 'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n', - 'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n', - 'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n", - 'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n', -} - -cloze_prompts = { - 'cloze_cn': [ - 'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6', - 'Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5', - 'Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?\nA: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39', - 'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?\nA: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8', - 'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9', - 'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29', - 'Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?\nA: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33', - 'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8', - 'Q: {question}\nA: {answer}', - ], - 'cloze_en': [ - 'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.', - 'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.', - "Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.", - 'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. 
How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.', - 'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.', - 'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.', - 'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.', - 'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.', - 'Q: {question}\nA: {answer}', - ], -} - -mathbench_sets = { - # Practice Part - 'college': ['single_choice_cn', 'single_choice_en'], - 'high': ['single_choice_cn', 'single_choice_en'], - 'middle': ['single_choice_cn', 'single_choice_en'], - 'primary': ['cloze_cn', 'cloze_en'], - 'calculate': ['cloze_en'], - # Theory part - 'college_knowledge': ['single_choice_cn', 'single_choice_en'], - 'high_knowledge': ['single_choice_cn', 'single_choice_en'], - 'middle_knowledge': ['single_choice_cn', 'single_choice_en'], - 'primary_knowledge': ['single_choice_cn', 'single_choice_en'], -} - -# Generate reasoning path or not, only for single choice -with_reasoning = False - -# Use circular evaluation or not -with_circular_eval = False - -mathbench_datasets = [] - -for _split in list(mathbench_sets.keys()): - for _name in mathbench_sets[_split]: - mathbench_reader_cfg = dict( - input_columns=['question'], - output_column='answer', - ) - - if 'single_choice' in _name: - if with_reasoning: - mathbench_infer_cfg = dict( - prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ) - else: - mathbench_infer_cfg = dict( - prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=PPLInferencer), - ) - else: - mathbench_infer_cfg = dict( - prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), - ) - - - if 'single_choice' in _name: - pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') - else: - pred_postprocessor = dict(type=mathbench_postprocess, name=_name) - if 'single_choice' in _name and with_circular_eval: - evaluator = dict(type=CircularEvaluator) - else: - evaluator = dict(type=AccEvaluator) - 
mathbench_eval_cfg = dict( - evaluator=evaluator, - pred_postprocessor=pred_postprocessor, - ) - - mathbench_datasets.append( - dict( - abbr='mathbench-' + _split + '-' + _name, - type=MathBenchDataset, - path=f'./data/mathbench_v1_ori/{_split}', - name=_name, - with_circular=with_circular_eval, - reader_cfg=mathbench_reader_cfg, - infer_cfg=mathbench_infer_cfg, - eval_cfg=mathbench_eval_cfg, - ) - ) diff --git a/configs/datasets/MathBench/mathbench_gen.py b/configs/datasets/MathBench/mathbench_gen.py index 63478a608..90e8e5fe0 100644 --- a/configs/datasets/MathBench/mathbench_gen.py +++ b/configs/datasets/MathBench/mathbench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mathbench_2024_gen_19e486 import mathbench_datasets # noqa: F401, F403 + from .mathbench_2024_gen_1dc21d import mathbench_datasets # noqa: F401, F403 diff --git a/configs/datasets/MathBench/mathbench_prompt.py b/configs/datasets/MathBench/mathbench_prompt.py new file mode 100644 index 000000000..069528ee4 --- /dev/null +++ b/configs/datasets/MathBench/mathbench_prompt.py @@ -0,0 +1,103 @@ +zero_shot_prompts = { + 'single_choice_cn_with_reasoning': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'), + ], + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", where "X" is the option from A,B,C,D that you think is correct. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'), + ], +} + +few_shot_prompts = { + 'single_choice_cn': [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='回答: D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'), + dict(role='BOT', prompt='回答: A'), + dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='回答: B'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_cn_with_reasoning' : [ + dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 
第四象限'), + dict(role='BOT', prompt='回答: 首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'), + dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'), + dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'), + dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'), + dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'), + dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个直角三角形的两条直角边长分别为$a$和$b$,那么斜边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$,$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项:A'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'single_choice_en': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. 
$4\\pi$'), + dict(role='BOT', prompt='Response: B'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'single_choice_en_with_reasoning': [ + dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'), + dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'), + dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'), + dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'), + dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'), + dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'), + dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'), + dict(role='BOT', prompt='Response: Let the cylinder have base radius $r$ and height $h$. Its lateral surface area is $2\\pi r h = 4\\pi$, so $rh = 2$. Because the upper circle of the cylinder lies on the lateral surface of the cone, similar triangles give $\\frac{r}{2} = \\frac{4 - h}{4}$, so $r = 2 - \\frac{h}{2}$. Substituting into $rh = 2$ gives $(2 - \\frac{h}{2})h = 2$, that is $h^2 - 4h + 4 = 0$, so $h = 2$ and $r = 1$. The volume of the cylinder is $\\pi r^2 h = \\pi \\times 1^2 \\times 2 = 2\\pi$. Answer option: B'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], + 'cloze_cn': [ + dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'), + dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'), + dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'), + dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'), + dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'), + dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'), + dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'), + dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'), + dict(role='HUMAN', prompt='问题: {question}'), + dict(role='BOT', prompt='回答: {answer}'), + ], + 'cloze_en': [ + dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'), + dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'), + dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'), + dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'), + dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'), + dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'), + dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'), + dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. 
The answer is 8.'), + dict(role='HUMAN', prompt='Question: {question}'), + dict(role='BOT', prompt='Response: {answer}'), + ], +} + +mathbench_sets = { + # Practice Part + 'college': ['single_choice_cn', 'single_choice_en'], + 'high': ['single_choice_cn', 'single_choice_en'], + 'middle': ['single_choice_cn', 'single_choice_en'], + 'primary': ['cloze_cn', 'cloze_en'], + 'arithmetic': ['cloze_en'], + # Theory part + 'college_knowledge': ['single_choice_cn', 'single_choice_en'], + 'high_knowledge': ['single_choice_cn', 'single_choice_en'], + 'middle_knowledge': ['single_choice_cn', 'single_choice_en'], + 'primary_knowledge': ['single_choice_cn', 'single_choice_en'], +} diff --git a/configs/datasets/TheoremQA/README.md b/configs/datasets/TheoremQA/README.md new file mode 100644 index 000000000..d62dddada --- /dev/null +++ b/configs/datasets/TheoremQA/README.md @@ -0,0 +1,64 @@ +# TheoremQA + +## Base Models + +| model | TheoremQA | +|:------------------------:|------------:| +| llama-7b-turbomind | 10.25 | +| llama-13b-turbomind | 11.25 | +| llama-30b-turbomind | 14.25 | +| llama-65b-turbomind | 15.62 | +| llama-2-7b-turbomind | 12.62 | +| llama-2-13b-turbomind | 11.88 | +| llama-2-70b-turbomind | 15.62 | +| llama-3-8b-turbomind | 20.25 | +| llama-3-70b-turbomind | 33.62 | +| internlm2-1.8b-turbomind | 10.50 | +| internlm2-7b-turbomind | 21.88 | +| internlm2-20b-turbomind | 26.00 | +| qwen-1.8b-turbomind | 9.38 | +| qwen-7b-turbomind | 15.00 | +| qwen-14b-turbomind | 21.62 | +| qwen-72b-turbomind | 27.12 | +| qwen1.5-0.5b-hf | 5.88 | +| qwen1.5-1.8b-hf | 12.00 | +| qwen1.5-4b-hf | 13.75 | +| qwen1.5-7b-hf | 4.25 | +| qwen1.5-14b-hf | 12.62 | +| qwen1.5-32b-hf | 26.62 | +| qwen1.5-72b-hf | 26.62 | +| qwen1.5-moe-a2-7b-hf | 7.50 | +| mistral-7b-v0.1-hf | 17.00 | +| mistral-7b-v0.2-hf | 16.25 | +| mixtral-8x7b-v0.1-hf | 24.12 | +| mixtral-8x22b-v0.1-hf | 36.75 | +| yi-6b-hf | 13.88 | +| yi-34b-hf | 24.75 | +| deepseek-7b-base-hf | 12.38 | +| deepseek-67b-base-hf | 21.25 | + +## Chat Models + +| model | TheoremQA | +|:-----------------------------:|------------:| +| qwen1.5-0.5b-chat-hf | 9.00 | +| qwen1.5-1.8b-chat-hf | 9.25 | +| qwen1.5-4b-chat-hf | 13.88 | +| qwen1.5-7b-chat-hf | 12.25 | +| qwen1.5-14b-chat-hf | 13.63 | +| qwen1.5-32b-chat-hf | 19.25 | +| qwen1.5-72b-chat-hf | 22.75 | +| qwen1.5-110b-chat-hf | 17.50 | +| internlm2-chat-1.8b-hf | 13.63 | +| internlm2-chat-1.8b-sft-hf | 12.88 | +| internlm2-chat-7b-hf | 18.50 | +| internlm2-chat-7b-sft-hf | 18.75 | +| internlm2-chat-20b-hf | 23.00 | +| internlm2-chat-20b-sft-hf | 25.12 | +| llama-3-8b-instruct-hf | 19.38 | +| llama-3-70b-instruct-hf | 36.25 | +| llama-3-8b-instruct-lmdeploy | 19.62 | +| llama-3-70b-instruct-lmdeploy | 34.50 | +| mistral-7b-instruct-v0.1-hf | 12.62 | +| mistral-7b-instruct-v0.2-hf | 11.38 | +| mixtral-8x7b-instruct-v0.1-hf | 26.00 | diff --git a/configs/datasets/bbh/README.md b/configs/datasets/bbh/README.md new file mode 100644 index 000000000..e0e35850d --- /dev/null +++ b/configs/datasets/bbh/README.md @@ -0,0 +1,245 @@ +# BBH + +## Base Models + +| model | bbh | +|:------------------------:|------:| +| llama-7b-turbomind | 33.34 | +| llama-13b-turbomind | 37.99 | +| llama-30b-turbomind | 49.86 | +| llama-65b-turbomind | 58.26 | +| llama-2-7b-turbomind | 38.27 | +| llama-2-13b-turbomind | 45.68 | +| llama-2-70b-turbomind | 64.78 | +| llama-3-8b-turbomind | 59.69 | +| llama-3-70b-turbomind | 79.16 | +| internlm2-1.8b-turbomind | 36.03 | +| internlm2-7b-turbomind | 63.56 | +| internlm2-20b-turbomind | 71.29 | +| 
qwen-1.8b-turbomind | 22.53 | +| qwen-7b-turbomind | 45.89 | +| qwen-14b-turbomind | 56.75 | +| qwen-72b-turbomind | 63.35 | +| qwen1.5-0.5b-hf | 20.54 | +| qwen1.5-1.8b-hf | 27.01 | +| qwen1.5-4b-hf | 34.81 | +| qwen1.5-7b-hf | 39.87 | +| qwen1.5-14b-hf | 50.38 | +| qwen1.5-32b-hf | 67.47 | +| qwen1.5-72b-hf | 58.81 | +| qwen1.5-moe-a2-7b-hf | 39.46 | +| mistral-7b-v0.1-hf | 56.71 | +| mistral-7b-v0.2-hf | 57.32 | +| mixtral-8x7b-v0.1-hf | 68.46 | +| mixtral-8x22b-v0.1-hf | 79.48 | +| yi-6b-hf | 44.82 | +| yi-34b-hf | 66.37 | +| deepseek-7b-base-hf | 42.88 | +| deepseek-67b-base-hf | 71.86 | + +### Details + +| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects | +|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:| +| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 | +| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 | +| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 | +| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 | +| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 | +| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 | +| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 | +| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 | +| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 | +| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 | +| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 | +| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 | +| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 | +| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 | +| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 | +| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 | +| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 | +| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 | +| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 | +| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 | +| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 | +| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 | +| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 | +| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 | +| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 | +| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 
66.85 | 60.80 | 26.40 | +| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 | +| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 | +| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 | +| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 | +| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 | +| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 | + +| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two | +|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:| +| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 | +| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 | +| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 | +| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 | +| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 | +| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 | +| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 | +| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 | +| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 | +| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 | +| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 | +| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 | +| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 | +| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 | +| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 | +| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 | +| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 | +| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 | +| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 | +| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 | +| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 | +| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 | +| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 | +| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 | +| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 
54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 | +| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 | +| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 | +| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 | +| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 | +| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 | +| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 | +| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 | + +| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies | +|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:| +| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 | +| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 | +| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 | +| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 | +| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 | +| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 | +| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 | +| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 | +| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 | +| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 | +| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 | +| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 | +| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 | +| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 | +| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 | +| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 | +| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 | +| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 | +| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 | +| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 | +| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 | +| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 | +| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 | +| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 | +| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 | +| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 | +| 
mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 | +| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 | +| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 | +| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 | +| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 | +| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 | + +## Chat Models + +| model | bbh | +|:-----------------------------:|------:| +| qwen1.5-0.5b-chat-hf | 24.12 | +| qwen1.5-1.8b-chat-hf | 26.82 | +| qwen1.5-4b-chat-hf | 43.15 | +| qwen1.5-7b-chat-hf | 38.12 | +| qwen1.5-14b-chat-hf | 55.38 | +| qwen1.5-32b-chat-hf | 69.28 | +| qwen1.5-72b-chat-hf | 72.97 | +| qwen1.5-110b-chat-hf | 71.04 | +| internlm2-chat-1.8b-hf | 37.69 | +| internlm2-chat-1.8b-sft-hf | 37.12 | +| internlm2-chat-7b-hf | 57.83 | +| internlm2-chat-7b-sft-hf | 57.19 | +| internlm2-chat-20b-hf | 68.24 | +| internlm2-chat-20b-sft-hf | 69.38 | +| llama-3-8b-instruct-hf | 52.85 | +| llama-3-70b-instruct-hf | 82.42 | +| llama-3-8b-instruct-lmdeploy | 53.54 | +| llama-3-70b-instruct-lmdeploy | 82.58 | +| mistral-7b-instruct-v0.1-hf | 32.88 | +| mistral-7b-instruct-v0.2-hf | 48.84 | +| mixtral-8x7b-instruct-v0.1-hf | 59.64 | + +### Details + +| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects | +|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:| +| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 | +| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 | +| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 | +| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 | +| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 | +| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 | +| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 | +| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 | +| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 | +| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 | +| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 | +| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 | +| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 | +| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 | +| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 | +| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 | +| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 
| 31.51 | 48.40 | 47.75 | 76.80 | 47.60 | +| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 | +| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 | +| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 | +| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 | + +| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two | +|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:| +| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 | +| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 | +| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 | +| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 | +| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 | +| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 | +| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 | +| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 | +| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 | +| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 | +| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 | +| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 | +| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 | +| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 | +| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 | +| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 | +| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 | +| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 | +| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 | +| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 | +| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 | + +| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies | 
+|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:| +| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 | +| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 | +| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 | +| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 | +| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 | +| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 | +| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 | +| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 | +| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 | +| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 | +| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 | +| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 | +| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 | +| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 | +| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 | +| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 | +| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 | +| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 | +| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 | +| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 | +| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 | diff --git a/configs/datasets/ceval/README.md b/configs/datasets/ceval/README.md new file mode 100644 index 000000000..4b49f787f --- /dev/null +++ b/configs/datasets/ceval/README.md @@ -0,0 +1,367 @@ +# C-Eval + +## Base Models + +| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other | +|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:| +| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 | +| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 | +| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 | +| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 | 
+| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 | +| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 | +| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 | +| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 | +| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 | +| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 | +| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 | +| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 | +| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 | +| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 | +| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 | +| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 | +| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 | +| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 | +| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 | +| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 | +| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 | +| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 | +| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 | +| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 | +| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 | +| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 | +| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 | +| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 | +| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 | +| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 | +| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 | +| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 | + +### Details on Test Split + +| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | 
probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics | +|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:| +| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 | +| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 | +| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 | +| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 | +| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 | +| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 | +| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 | +| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 | +| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 | +| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 | +| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 | +| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 | +| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 | +| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 | +| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 | +| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 | +| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 | +| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 | +| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 | +| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 | +| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 | +| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 | +| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 | +| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 | +| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 | +| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 
45.21 | 29.52 | +| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 | +| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 | +| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 | +| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 | +| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 | +| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 | + +| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought | +|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:| +| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 | +| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 | +| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 | +| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 | +| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 | +| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 | +| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 | +| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 | +| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 | +| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 | +| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 | +| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 | +| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 | +| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 | +| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 | +| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 | +| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 | +| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 | +| qwen1.5-4b-hf | 66.86 | 77.33 | 
82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 | +| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 | +| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 | +| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 | +| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 | +| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 | +| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 | +| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 | +| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 | +| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 | +| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 | +| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 | +| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 | +| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 | + +| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies | +|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:| +| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 | +| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 | +| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 | +| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 | +| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 | +| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 | +| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 | +| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 | +| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 | +| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 | +| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 
70.13 | +| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 | +| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 | +| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 | +| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 | +| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 | +| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 | +| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 | +| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 | +| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 | +| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 | +| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 | +| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 | +| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 | +| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 | +| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 | +| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 | +| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 | +| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 | +| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 | +| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 | +| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 | + +| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant | +|:------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:| +| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 | +| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 | +| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 | +| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 | +| 
llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 | +| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 | +| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 | +| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 | +| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 | +| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 | +| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 | +| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 | +| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 | +| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 | +| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 | +| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 | +| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 | +| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 | +| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 | +| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 | +| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 | +| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 | +| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 | +| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 | +| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 | +| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 | +| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 | +| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 | +| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 | +| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 | +| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 | +| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 | + +| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician | 
+|:------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:| +| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 | +| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 | +| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 | +| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 | +| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 | +| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 | +| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 | +| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 | +| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 | +| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 | +| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 | +| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 | +| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 | +| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 | +| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 | +| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 | +| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 | +| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 | +| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 | +| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 | +| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 | +| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 | +| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 | +| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 | +| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 | +| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 | +| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 | +| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 | +| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 | +| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 | +| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 | +| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 | + +### Details on Dev Split + +## Chat Models + +| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other | +|:-----------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:| +| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 | +| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 | +| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 | +| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 | +| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 | +| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 | +| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 | +| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 | +| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 
36.75 | 42.23 | 57.79 | 54.83 | 45.15 | +| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 | +| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 | +| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 | +| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 | +| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 | +| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 | +| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 | +| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 | +| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 | +| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 | +| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 | +| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 | + +### Details on Test Split + +| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics | +|:-----------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:| +| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 | +| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 | +| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 | +| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 | +| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 | +| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 | +| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 | +| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 | +| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 | +| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 | +| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 | +| 
internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 | +| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 | +| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 | +| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 | +| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 | +| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 | +| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 | +| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 | +| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 | +| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 | + +| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought | +|:-----------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:| +| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 | +| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 | +| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 | +| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 | +| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 | +| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 | +| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 | +| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 | +| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 | +| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 | +| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 | +| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 | +| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 | +| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 
78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 | +| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 | +| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 | +| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 | +| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 | +| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 | +| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 | +| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 | + +| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies | +|:-----------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:| +| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 | +| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 | +| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 | +| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 | +| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 | +| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 | +| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 | +| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 | +| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 | +| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 | +| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 | +| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 | +| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 | +| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 | +| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 | +| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 | +| llama-3-8b-instruct-lmdeploy | 
54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 | +| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 | +| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 | +| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 | +| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 | + +| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant | +|:-----------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:| +| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 | +| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 | +| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 | +| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 | +| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 | +| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 | +| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 | +| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 | +| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 | +| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 | +| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 | +| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 | +| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 | +| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 | +| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 | +| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 | +| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 | +| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 | +| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 | +| 
mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 | +| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 | + +| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician | +|:-----------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:| +| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 | +| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 | +| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 | +| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 | +| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 | +| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 | +| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 | +| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 | +| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 | +| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 | +| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 | +| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 | +| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 | +| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 | +| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 | +| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 | +| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 | +| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 | +| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 | +| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 | +| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 | + +### Details on Dev Split diff --git a/configs/datasets/gpqa/README.md b/configs/datasets/gpqa/README.md new file mode 100644 index 000000000..05f4aeb9b --- /dev/null +++ b/configs/datasets/gpqa/README.md @@ -0,0 +1,64 @@ +# GPQA + +## Base Models + +| model | GPQA_diamond | +|:------------------------:|---------------:| +| llama-7b-turbomind | 24.24 | +| llama-13b-turbomind | 25.25 | +| llama-30b-turbomind | 22.73 | +| llama-65b-turbomind | 21.72 | +| llama-2-7b-turbomind | 25.25 | +| llama-2-13b-turbomind | 23.74 | +| llama-2-70b-turbomind | 28.28 | +| llama-3-8b-turbomind | 31.82 | +| llama-3-70b-turbomind | 40.91 | +| internlm2-1.8b-turbomind | 24.24 | +| internlm2-7b-turbomind | 28.28 | +| internlm2-20b-turbomind | 31.31 | +| qwen-1.8b-turbomind | 28.79 | +| qwen-7b-turbomind | 24.75 | +| qwen-14b-turbomind | 27.78 | +| qwen-72b-turbomind | 31.31 | +| qwen1.5-0.5b-hf | 23.74 | +| qwen1.5-1.8b-hf | 28.79 | +| qwen1.5-4b-hf | 23.23 | +| qwen1.5-7b-hf | 20.71 | +| qwen1.5-14b-hf | 32.32 | +| qwen1.5-32b-hf | 30.81 | +| qwen1.5-72b-hf | 31.82 | +| qwen1.5-moe-a2-7b-hf | 28.79 | +| mistral-7b-v0.1-hf | 24.75 | +| mistral-7b-v0.2-hf | 23.74 | +| mixtral-8x7b-v0.1-hf | 28.79 | +| mixtral-8x22b-v0.1-hf | 36.36 | +| yi-6b-hf | 28.28 | +| yi-34b-hf | 35.86 | +| deepseek-7b-base-hf | 20.71 | +| deepseek-67b-base-hf | 25.25 | + +## Chat Models + +| model | GPQA_diamond | +|:-----------------------------:|---------------:| +| qwen1.5-0.5b-chat-hf | 19.70 | +| qwen1.5-1.8b-chat-hf | 29.80 | +| qwen1.5-4b-chat-hf | 25.25 | +| qwen1.5-7b-chat-hf | 31.82 | +| qwen1.5-14b-chat-hf | 30.30 | +| qwen1.5-32b-chat-hf | 31.31 | +| qwen1.5-72b-chat-hf | 32.83 | +| qwen1.5-110b-chat-hf | 35.86 | +| internlm2-chat-1.8b-hf | 25.76 | +| internlm2-chat-1.8b-sft-hf | 26.26 | +| 
internlm2-chat-7b-hf | 28.28 | +| internlm2-chat-7b-sft-hf | 27.27 | +| internlm2-chat-20b-hf | 30.30 | +| internlm2-chat-20b-sft-hf | 29.29 | +| llama-3-8b-instruct-hf | 25.76 | +| llama-3-70b-instruct-hf | 37.88 | +| llama-3-8b-instruct-lmdeploy | 25.76 | +| llama-3-70b-instruct-lmdeploy | 37.88 | +| mistral-7b-instruct-v0.1-hf | 30.30 | +| mistral-7b-instruct-v0.2-hf | 25.25 | +| mixtral-8x7b-instruct-v0.1-hf | 30.30 | diff --git a/configs/datasets/gsm8k/README.md b/configs/datasets/gsm8k/README.md new file mode 100644 index 000000000..c91910de6 --- /dev/null +++ b/configs/datasets/gsm8k/README.md @@ -0,0 +1,64 @@ +# GSM8K + +## Base Models + +| model | gsm8k | +|:------------------------:|--------:| +| llama-7b-turbomind | 10.31 | +| llama-13b-turbomind | 20.55 | +| llama-30b-turbomind | 42.08 | +| llama-65b-turbomind | 54.81 | +| llama-2-7b-turbomind | 16.76 | +| llama-2-13b-turbomind | 29.87 | +| llama-2-70b-turbomind | 63.53 | +| llama-3-8b-turbomind | 54.28 | +| llama-3-70b-turbomind | 69.98 | +| internlm2-1.8b-turbomind | 30.40 | +| internlm2-7b-turbomind | 69.98 | +| internlm2-20b-turbomind | 76.80 | +| qwen-1.8b-turbomind | 23.73 | +| qwen-7b-turbomind | 54.36 | +| qwen-14b-turbomind | 61.64 | +| qwen-72b-turbomind | 79.68 | +| qwen1.5-0.5b-hf | 13.27 | +| qwen1.5-1.8b-hf | 34.87 | +| qwen1.5-4b-hf | 47.61 | +| qwen1.5-7b-hf | 54.36 | +| qwen1.5-14b-hf | 63.53 | +| qwen1.5-32b-hf | 72.71 | +| qwen1.5-72b-hf | 79.53 | +| qwen1.5-moe-a2-7b-hf | 61.26 | +| mistral-7b-v0.1-hf | 47.61 | +| mistral-7b-v0.2-hf | 45.19 | +| mixtral-8x7b-v0.1-hf | 66.26 | +| mixtral-8x22b-v0.1-hf | 82.87 | +| yi-6b-hf | 39.58 | +| yi-34b-hf | 67.70 | +| deepseek-7b-base-hf | 20.17 | +| deepseek-67b-base-hf | 68.16 | + +## Chat Models + +| model | gsm8k | +|:-----------------------------:|--------:| +| qwen1.5-0.5b-chat-hf | 8.79 | +| qwen1.5-1.8b-chat-hf | 27.60 | +| qwen1.5-4b-chat-hf | 47.61 | +| qwen1.5-7b-chat-hf | 56.25 | +| qwen1.5-14b-chat-hf | 64.90 | +| qwen1.5-32b-chat-hf | 79.91 | +| qwen1.5-72b-chat-hf | 77.03 | +| qwen1.5-110b-chat-hf | 79.53 | +| internlm2-chat-1.8b-hf | 39.73 | +| internlm2-chat-1.8b-sft-hf | 36.85 | +| internlm2-chat-7b-hf | 69.90 | +| internlm2-chat-7b-sft-hf | 69.83 | +| internlm2-chat-20b-hf | 75.21 | +| internlm2-chat-20b-sft-hf | 76.95 | +| llama-3-8b-instruct-hf | 79.53 | +| llama-3-70b-instruct-hf | 89.76 | +| llama-3-8b-instruct-lmdeploy | 78.77 | +| llama-3-70b-instruct-lmdeploy | 89.31 | +| mistral-7b-instruct-v0.1-hf | 42.23 | +| mistral-7b-instruct-v0.2-hf | 45.56 | +| mixtral-8x7b-instruct-v0.1-hf | 65.13 | diff --git a/configs/datasets/hellaswag/README.md b/configs/datasets/hellaswag/README.md new file mode 100644 index 000000000..dd62abc7c --- /dev/null +++ b/configs/datasets/hellaswag/README.md @@ -0,0 +1,64 @@ +# HellaSwag + +## Base Models + +| model | hellaswag | +|:------------------------:|------------:| +| llama-7b-turbomind | 26.99 | +| llama-13b-turbomind | 34.21 | +| llama-30b-turbomind | 35.65 | +| llama-65b-turbomind | 44.63 | +| llama-2-7b-turbomind | 29.29 | +| llama-2-13b-turbomind | 45.06 | +| llama-2-70b-turbomind | 55.91 | +| llama-3-8b-turbomind | 50.86 | +| llama-3-70b-turbomind | 80.60 | +| internlm2-1.8b-turbomind | 44.86 | +| internlm2-7b-turbomind | 89.52 | +| internlm2-20b-turbomind | 91.41 | +| qwen-1.8b-turbomind | 38.04 | +| qwen-7b-turbomind | 64.62 | +| qwen-14b-turbomind | 85.88 | +| qwen-72b-turbomind | 90.40 | +| qwen1.5-0.5b-hf | 29.19 | +| qwen1.5-1.8b-hf | 42.32 | +| qwen1.5-4b-hf | 55.89 | +| qwen1.5-7b-hf | 68.51 | +| 
qwen1.5-14b-hf | 83.86 | +| qwen1.5-32b-hf | 87.28 | +| qwen1.5-72b-hf | 90.41 | +| qwen1.5-moe-a2-7b-hf | 72.42 | +| mistral-7b-v0.1-hf | 42.04 | +| mistral-7b-v0.2-hf | 46.24 | +| mixtral-8x7b-v0.1-hf | 66.22 | +| mixtral-8x22b-v0.1-hf | 79.66 | +| yi-6b-hf | 66.83 | +| yi-34b-hf | 83.83 | +| deepseek-7b-base-hf | 30.42 | +| deepseek-67b-base-hf | 70.75 | + +## Chat Models + +| model | hellaswag | +|:-----------------------------:|------------:| +| qwen1.5-0.5b-chat-hf | 29.60 | +| qwen1.5-1.8b-chat-hf | 41.71 | +| qwen1.5-4b-chat-hf | 60.45 | +| qwen1.5-7b-chat-hf | 71.58 | +| qwen1.5-14b-chat-hf | 79.70 | +| qwen1.5-32b-chat-hf | 88.56 | +| qwen1.5-72b-chat-hf | 89.37 | +| qwen1.5-110b-chat-hf | 91.11 | +| internlm2-chat-1.8b-hf | 60.47 | +| internlm2-chat-1.8b-sft-hf | 61.58 | +| internlm2-chat-7b-hf | 84.80 | +| internlm2-chat-7b-sft-hf | 85.21 | +| internlm2-chat-20b-hf | 88.48 | +| internlm2-chat-20b-sft-hf | 88.95 | +| llama-3-8b-instruct-hf | 74.39 | +| llama-3-70b-instruct-hf | 89.07 | +| llama-3-8b-instruct-lmdeploy | 73.31 | +| llama-3-70b-instruct-lmdeploy | 87.28 | +| mistral-7b-instruct-v0.1-hf | 53.00 | +| mistral-7b-instruct-v0.2-hf | 65.72 | +| mixtral-8x7b-instruct-v0.1-hf | 76.16 | diff --git a/configs/datasets/humaneval/README.md b/configs/datasets/humaneval/README.md new file mode 100644 index 000000000..4637e8a0a --- /dev/null +++ b/configs/datasets/humaneval/README.md @@ -0,0 +1,64 @@ +# HumanEval + +## Base Models + +| model | pass@1 | +|:------------------------:|---------:| +| llama-7b-turbomind | 12.80 | +| llama-13b-turbomind | 15.24 | +| llama-30b-turbomind | 9.15 | +| llama-65b-turbomind | 7.32 | +| llama-2-7b-turbomind | 14.02 | +| llama-2-13b-turbomind | 15.24 | +| llama-2-70b-turbomind | 15.24 | +| llama-3-8b-turbomind | 28.05 | +| llama-3-70b-turbomind | 28.05 | +| internlm2-1.8b-turbomind | 30.49 | +| internlm2-7b-turbomind | 48.17 | +| internlm2-20b-turbomind | 51.83 | +| qwen-1.8b-turbomind | 16.46 | +| qwen-7b-turbomind | 23.78 | +| qwen-14b-turbomind | 23.78 | +| qwen-72b-turbomind | 66.46 | +| qwen1.5-0.5b-hf | 8.54 | +| qwen1.5-1.8b-hf | 23.17 | +| qwen1.5-4b-hf | 41.46 | +| qwen1.5-7b-hf | 53.05 | +| qwen1.5-14b-hf | 57.32 | +| qwen1.5-32b-hf | 70.12 | +| qwen1.5-72b-hf | 65.85 | +| qwen1.5-moe-a2-7b-hf | 45.73 | +| mistral-7b-v0.1-hf | 14.02 | +| mistral-7b-v0.2-hf | 9.15 | +| mixtral-8x7b-v0.1-hf | 24.39 | +| mixtral-8x22b-v0.1-hf | 16.46 | +| yi-6b-hf | 14.63 | +| yi-34b-hf | 17.07 | +| deepseek-7b-base-hf | 18.29 | +| deepseek-67b-base-hf | 23.17 | + +## Chat Models + +| model | pass@1 | +|:-----------------------------:|---------:| +| qwen1.5-0.5b-chat-hf | 9.15 | +| qwen1.5-1.8b-chat-hf | 15.85 | +| qwen1.5-4b-chat-hf | 30.49 | +| qwen1.5-7b-chat-hf | 40.85 | +| qwen1.5-14b-chat-hf | 50.00 | +| qwen1.5-32b-chat-hf | 57.93 | +| qwen1.5-72b-chat-hf | 60.37 | +| qwen1.5-110b-chat-hf | 65.24 | +| internlm2-chat-1.8b-hf | 33.54 | +| internlm2-chat-1.8b-sft-hf | 34.15 | +| internlm2-chat-7b-hf | 56.71 | +| internlm2-chat-7b-sft-hf | 61.59 | +| internlm2-chat-20b-hf | 67.68 | +| internlm2-chat-20b-sft-hf | 67.68 | +| llama-3-8b-instruct-hf | 55.49 | +| llama-3-70b-instruct-hf | 70.73 | +| llama-3-8b-instruct-lmdeploy | 57.93 | +| llama-3-70b-instruct-lmdeploy | 70.73 | +| mistral-7b-instruct-v0.1-hf | 32.32 | +| mistral-7b-instruct-v0.2-hf | 29.27 | +| mixtral-8x7b-instruct-v0.1-hf | 34.15 | diff --git a/configs/datasets/math/README.md b/configs/datasets/math/README.md new file mode 100644 index 000000000..a11a39083 --- /dev/null +++ 
b/configs/datasets/math/README.md @@ -0,0 +1,64 @@ +# MATH + +## Base Models + +| model | math | +|:------------------------:|-------:| +| llama-7b-turbomind | 2.94 | +| llama-13b-turbomind | 3.84 | +| llama-30b-turbomind | 6.54 | +| llama-65b-turbomind | 10.66 | +| llama-2-7b-turbomind | 3.58 | +| llama-2-13b-turbomind | 5.30 | +| llama-2-70b-turbomind | 13.26 | +| llama-3-8b-turbomind | 16.42 | +| llama-3-70b-turbomind | 39.64 | +| internlm2-1.8b-turbomind | 9.42 | +| internlm2-7b-turbomind | 25.16 | +| internlm2-20b-turbomind | 32.24 | +| qwen-1.8b-turbomind | 6.30 | +| qwen-7b-turbomind | 15.56 | +| qwen-14b-turbomind | 30.38 | +| qwen-72b-turbomind | 44.18 | +| qwen1.5-0.5b-hf | 4.16 | +| qwen1.5-1.8b-hf | 11.32 | +| qwen1.5-4b-hf | 17.50 | +| qwen1.5-7b-hf | 17.34 | +| qwen1.5-14b-hf | 36.18 | +| qwen1.5-32b-hf | 45.74 | +| qwen1.5-72b-hf | 41.56 | +| qwen1.5-moe-a2-7b-hf | 27.96 | +| mistral-7b-v0.1-hf | 13.44 | +| mistral-7b-v0.2-hf | 12.74 | +| mixtral-8x7b-v0.1-hf | 29.46 | +| mixtral-8x22b-v0.1-hf | 41.82 | +| yi-6b-hf | 6.60 | +| yi-34b-hf | 18.80 | +| deepseek-7b-base-hf | 4.66 | +| deepseek-67b-base-hf | 18.76 | + +## Chat Models + +| model | math | +|:-----------------------------:|-------:| +| qwen1.5-0.5b-chat-hf | 0.56 | +| qwen1.5-1.8b-chat-hf | 4.94 | +| qwen1.5-4b-chat-hf | 7.34 | +| qwen1.5-7b-chat-hf | 22.14 | +| qwen1.5-14b-chat-hf | 32.22 | +| qwen1.5-32b-chat-hf | 41.80 | +| qwen1.5-72b-chat-hf | 45.22 | +| qwen1.5-110b-chat-hf | 54.38 | +| internlm2-chat-1.8b-hf | 14.06 | +| internlm2-chat-1.8b-sft-hf | 13.10 | +| internlm2-chat-7b-hf | 28.08 | +| internlm2-chat-7b-sft-hf | 27.60 | +| internlm2-chat-20b-hf | 34.68 | +| internlm2-chat-20b-sft-hf | 32.54 | +| llama-3-8b-instruct-hf | 27.50 | +| llama-3-70b-instruct-hf | 47.52 | +| llama-3-8b-instruct-lmdeploy | 27.42 | +| llama-3-70b-instruct-lmdeploy | 46.90 | +| mistral-7b-instruct-v0.1-hf | 8.48 | +| mistral-7b-instruct-v0.2-hf | 10.82 | +| mixtral-8x7b-instruct-v0.1-hf | 27.02 | diff --git a/configs/datasets/mbpp/README.md b/configs/datasets/mbpp/README.md new file mode 100644 index 000000000..3d0d84830 --- /dev/null +++ b/configs/datasets/mbpp/README.md @@ -0,0 +1,64 @@ +# MBPP + +## Base Models + +| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | +|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| +| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | +| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | +| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | +| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | +| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | +| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | +| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | +| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | +| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | +| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | +| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | +| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | +| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | +| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | +| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | +| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | +| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | +| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | +| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | +| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | +| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | +| qwen1.5-32b-hf | 
59.14 | 152 | 7 | 25 | 73 | +| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | +| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | +| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | +| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | +| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | +| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | +| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | +| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | +| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 | +| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | + +## Chat Models + +| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | +|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| +| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | +| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | +| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | +| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | +| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | +| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | +| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | +| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | +| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | +| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | +| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | +| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | +| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | +| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | +| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | +| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | +| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | +| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | +| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | +| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | +| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | diff --git a/configs/datasets/mmlu/README.md b/configs/datasets/mmlu/README.md new file mode 100644 index 000000000..b8e4a9afd --- /dev/null +++ b/configs/datasets/mmlu/README.md @@ -0,0 +1,363 @@ +# MMLU + +## Base Models + +| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | +|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:| +| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 | +| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 | +| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 | +| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 | +| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 | +| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 | +| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 | +| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 | +| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 | +| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 | +| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 | +| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 | +| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 | +| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 | +| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 | +| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 | +| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 | +| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 | +| 
qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 | +| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 | +| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 | +| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 | +| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 | +| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 | +| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 | +| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 | +| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 | +| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 | +| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 | +| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 | +| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 | +| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 | + +### Details + +| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts | +|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:| +| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 | +| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 | +| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 | +| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 | +| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 | +| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 | +| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 | +| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 | +| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 | +| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 | +| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 | +| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 | +| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 | +| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 | +| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 | +| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 | +| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 | +| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 
| 31.25 | 53.96 | 37.00 | +| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 | +| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 | +| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 | +| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 | +| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 | +| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 | +| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 | +| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 | +| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 | +| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 | +| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 | +| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 | +| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 | +| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 | + +| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology | +|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:| +| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 | +| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 | +| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 | +| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 | +| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 | +| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 | +| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 | +| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 | +| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 | +| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 | +| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 
| 65.03 | +| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 | +| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 | +| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 | +| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 | +| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 | +| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 | +| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 | +| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 | +| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 | +| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 | +| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 | +| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 | +| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 | +| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 | +| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 | +| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 | +| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 | +| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 | +| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 | +| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 | +| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 | + +| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history | +|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:| +| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 | +| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 | +| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 | +| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 
76.36 | +| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 | +| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 | +| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 | +| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 | +| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 | +| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 | +| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 | +| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 | +| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 | +| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 | +| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 | +| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 | +| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 | +| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 | +| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 | +| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 | +| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 | +| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 | +| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 | +| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 | +| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 | +| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 | +| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 | +| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 | +| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 | +| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 | +| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 | +| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 | + +| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | 
logical_fallacies | high_school_world_history | professional_medicine | +|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:| +| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 | +| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 | +| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 | +| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 | +| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 | +| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 | +| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 | +| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 | +| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 | +| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 | +| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 | +| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 | +| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 | +| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 | +| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 | +| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 | +| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 | +| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 | +| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 | +| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 | +| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 | +| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 | +| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 | +| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 | +| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 | +| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 | +| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 
| 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 | +| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 | +| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 | +| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 | +| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 | +| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 | + +| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy | +|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:| +| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 | +| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 | +| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 | +| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 | +| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 | +| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 | +| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 | +| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 | +| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 | +| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 | +| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 | +| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 | +| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 | +| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 | +| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 | +| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 | +| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 | +| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 | +| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 | +| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 | +| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 | +| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 | +| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 | +| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 | +| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 | +| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 | +| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 | +| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 | +| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 
35.96 | 82.94 | 67.26 | 86.00 | +| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 | +| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 | +| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 | + +## Chat Models + +| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | +|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:| +| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 | +| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 | +| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 | +| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 | +| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 | +| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 | +| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 | +| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 | +| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 | +| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 | +| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 | +| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 | +| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 | +| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 | +| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 | +| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 | +| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 | +| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 | +| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 | +| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 | +| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 | + +### Details + +| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts | +|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:| +| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 | +| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 | +| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 | +| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 | +| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 | +| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 | +| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 | +| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 | +| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 
30.00 | 33.93 | 53.58 | 28.00 | +| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 | +| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 | +| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 | +| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 | +| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 | +| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 | +| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 | +| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 | +| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 | +| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 | +| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 | +| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 | + +| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology | +|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:| +| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 | +| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 | +| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 | +| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 | +| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 | +| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 | +| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 | +| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 | +| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 | +| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 | +| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 | +| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 
45.83 | 70.00 | 65.36 | +| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 | +| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 | +| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 | +| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 | +| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 | +| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 | +| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 | +| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 | +| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 | + +| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history | +|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:| +| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 | +| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 | +| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 | +| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 | +| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 | +| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 | +| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 | +| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 | +| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 | +| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 | +| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 | +| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 | +| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 | +| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 | +| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 
66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 | +| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 | +| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 | +| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 | +| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 | +| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 | +| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 | + +| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine | +|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:| +| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 | +| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 | +| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 | +| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 | +| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 | +| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 | +| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 | +| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 | +| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 | +| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 | +| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 | +| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 | +| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 | +| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 | +| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 | +| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 | +| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 | +| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 
73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 | +| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 | +| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 | +| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 | + +| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy | +|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:| +| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 | +| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 | +| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 | +| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 | +| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 | +| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 | +| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 | +| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 | +| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 | +| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 | +| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 | +| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 | +| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 | +| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 | +| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 | +| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 | +| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 | +| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 | +| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 | +| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 | +| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 | diff --git a/configs/datasets/nq/README.md b/configs/datasets/nq/README.md new file mode 100644 index 000000000..3c795f2ff --- /dev/null +++ b/configs/datasets/nq/README.md @@ -0,0 +1,64 @@ +# NQ + +## Base Models + +| model | nq | +|:------------------------:|------:| +| llama-7b-turbomind | 15.10 | +| llama-13b-turbomind | 16.43 | +| llama-30b-turbomind | 22.11 | +| llama-65b-turbomind | 26.09 | +| llama-2-7b-turbomind | 14.35 | +| llama-2-13b-turbomind | 21.69 | +| llama-2-70b-turbomind | 23.27 | +| llama-3-8b-turbomind | 18.78 | +| llama-3-70b-turbomind | 31.88 | +| internlm2-1.8b-turbomind | 20.66 | +| internlm2-7b-turbomind | 41.05 | +| internlm2-20b-turbomind | 43.55 | +| qwen-1.8b-turbomind | 5.68 | +| qwen-7b-turbomind | 17.87 | +| 
qwen-14b-turbomind | 13.77 | +| qwen-72b-turbomind | 18.20 | +| qwen1.5-0.5b-hf | 6.01 | +| qwen1.5-1.8b-hf | 10.28 | +| qwen1.5-4b-hf | 15.73 | +| qwen1.5-7b-hf | 18.61 | +| qwen1.5-14b-hf | 16.07 | +| qwen1.5-32b-hf | 21.75 | +| qwen1.5-72b-hf | 20.53 | +| qwen1.5-moe-a2-7b-hf | 16.62 | +| mistral-7b-v0.1-hf | 20.66 | +| mistral-7b-v0.2-hf | 20.78 | +| mixtral-8x7b-v0.1-hf | 24.85 | +| mixtral-8x22b-v0.1-hf | 34.43 | +| yi-6b-hf | 10.08 | +| yi-34b-hf | 13.96 | +| deepseek-7b-base-hf | 8.45 | +| deepseek-67b-base-hf | 17.59 | + +## Chat Models + +| model | nq | +|:-----------------------------:|------:| +| qwen1.5-0.5b-chat-hf | 7.42 | +| qwen1.5-1.8b-chat-hf | 10.22 | +| qwen1.5-4b-chat-hf | 19.31 | +| qwen1.5-7b-chat-hf | 16.87 | +| qwen1.5-14b-chat-hf | 20.53 | +| qwen1.5-32b-chat-hf | 25.26 | +| qwen1.5-72b-chat-hf | 35.21 | +| qwen1.5-110b-chat-hf | 36.98 | +| internlm2-chat-1.8b-hf | 19.09 | +| internlm2-chat-1.8b-sft-hf | 18.14 | +| internlm2-chat-7b-hf | 28.73 | +| internlm2-chat-7b-sft-hf | 30.78 | +| internlm2-chat-20b-hf | 28.75 | +| internlm2-chat-20b-sft-hf | 34.10 | +| llama-3-8b-instruct-hf | 30.17 | +| llama-3-70b-instruct-hf | 40.25 | +| llama-3-8b-instruct-lmdeploy | 28.28 | +| llama-3-70b-instruct-lmdeploy | 39.14 | +| mistral-7b-instruct-v0.1-hf | 22.47 | +| mistral-7b-instruct-v0.2-hf | 25.18 | +| mixtral-8x7b-instruct-v0.1-hf | 32.08 | diff --git a/configs/datasets/race/README.md b/configs/datasets/race/README.md new file mode 100644 index 000000000..f1c55ba76 --- /dev/null +++ b/configs/datasets/race/README.md @@ -0,0 +1,64 @@ +# RACE + +## Base Models + +| model | race-high | race-middle | +|:------------------------:|------------:|--------------:| +| llama-7b-turbomind | 31.30 | 29.53 | +| llama-13b-turbomind | 35.56 | 40.25 | +| llama-30b-turbomind | 57.35 | 55.78 | +| llama-65b-turbomind | 70.21 | 75.35 | +| llama-2-7b-turbomind | 39.74 | 46.73 | +| llama-2-13b-turbomind | 57.06 | 60.52 | +| llama-2-70b-turbomind | 79.02 | 82.17 | +| llama-3-8b-turbomind | 67.75 | 73.61 | +| llama-3-70b-turbomind | 85.79 | 90.25 | +| internlm2-1.8b-turbomind | 64.72 | 70.40 | +| internlm2-7b-turbomind | 72.56 | 74.16 | +| internlm2-20b-turbomind | 72.90 | 74.03 | +| qwen-1.8b-turbomind | 63.09 | 69.29 | +| qwen-7b-turbomind | 80.30 | 85.38 | +| qwen-14b-turbomind | 88.11 | 92.06 | +| qwen-72b-turbomind | 90.62 | 93.59 | +| qwen1.5-0.5b-hf | 54.66 | 60.38 | +| qwen1.5-1.8b-hf | 67.27 | 73.33 | +| qwen1.5-4b-hf | 78.50 | 83.29 | +| qwen1.5-7b-hf | 82.73 | 86.70 | +| qwen1.5-14b-hf | 87.99 | 91.85 | +| qwen1.5-32b-hf | 90.57 | 93.25 | +| qwen1.5-72b-hf | 90.45 | 93.87 | +| qwen1.5-moe-a2-7b-hf | 79.56 | 83.57 | +| mistral-7b-v0.1-hf | 73.58 | 76.25 | +| mistral-7b-v0.2-hf | 73.67 | 77.09 | +| mixtral-8x7b-v0.1-hf | 80.13 | 84.61 | +| mixtral-8x22b-v0.1-hf | 86.56 | 89.62 | +| yi-6b-hf | 82.93 | 85.72 | +| yi-34b-hf | 90.94 | 92.76 | +| deepseek-7b-base-hf | 50.91 | 56.82 | +| deepseek-67b-base-hf | 83.53 | 88.23 | + +## Chat Models + +| model | race-high | race-middle | +|:-----------------------------:|------------:|--------------:| +| qwen1.5-0.5b-chat-hf | 49.03 | 52.79 | +| qwen1.5-1.8b-chat-hf | 66.24 | 72.91 | +| qwen1.5-4b-chat-hf | 73.53 | 80.29 | +| qwen1.5-7b-chat-hf | 83.28 | 88.09 | +| qwen1.5-14b-chat-hf | 87.51 | 91.36 | +| qwen1.5-32b-chat-hf | 91.22 | 93.52 | +| qwen1.5-72b-chat-hf | 91.11 | 93.38 | +| qwen1.5-110b-chat-hf | 92.31 | 93.66 | +| internlm2-chat-1.8b-hf | 73.87 | 81.13 | +| internlm2-chat-1.8b-sft-hf | 73.81 | 81.69 | +| internlm2-chat-7b-hf | 84.51 | 
88.72 | +| internlm2-chat-7b-sft-hf | 84.88 | 89.90 | +| internlm2-chat-20b-hf | 88.02 | 91.43 | +| internlm2-chat-20b-sft-hf | 88.11 | 91.57 | +| llama-3-8b-instruct-hf | 81.22 | 86.63 | +| llama-3-70b-instruct-hf | 89.57 | 93.45 | +| llama-3-8b-instruct-lmdeploy | 81.02 | 86.14 | +| llama-3-70b-instruct-lmdeploy | 89.34 | 93.25 | +| mistral-7b-instruct-v0.1-hf | 69.75 | 74.72 | +| mistral-7b-instruct-v0.2-hf | 73.30 | 77.58 | +| mixtral-8x7b-instruct-v0.1-hf | 81.88 | 87.26 | diff --git a/configs/datasets/triviaqa/README.md b/configs/datasets/triviaqa/README.md new file mode 100644 index 000000000..f5e155a95 --- /dev/null +++ b/configs/datasets/triviaqa/README.md @@ -0,0 +1,64 @@ +# TriviaQA + +## Base Models + +| model | triviaqa | +|:------------------------:|-----------:| +| llama-7b-turbomind | 40.39 | +| llama-13b-turbomind | 66.41 | +| llama-30b-turbomind | 75.90 | +| llama-65b-turbomind | 82.26 | +| llama-2-7b-turbomind | 43.21 | +| llama-2-13b-turbomind | 71.32 | +| llama-2-70b-turbomind | 67.45 | +| llama-3-8b-turbomind | 71.24 | +| llama-3-70b-turbomind | 88.16 | +| internlm2-1.8b-turbomind | 38.42 | +| internlm2-7b-turbomind | 69.15 | +| internlm2-20b-turbomind | 74.03 | +| qwen-1.8b-turbomind | 22.76 | +| qwen-7b-turbomind | 53.61 | +| qwen-14b-turbomind | 49.72 | +| qwen-72b-turbomind | 79.13 | +| qwen1.5-0.5b-hf | 21.24 | +| qwen1.5-1.8b-hf | 34.32 | +| qwen1.5-4b-hf | 44.59 | +| qwen1.5-7b-hf | 56.60 | +| qwen1.5-14b-hf | 59.96 | +| qwen1.5-32b-hf | 56.20 | +| qwen1.5-72b-hf | 77.81 | +| qwen1.5-moe-a2-7b-hf | 65.49 | +| mistral-7b-v0.1-hf | 72.93 | +| mistral-7b-v0.2-hf | 70.91 | +| mixtral-8x7b-v0.1-hf | 85.05 | +| mixtral-8x22b-v0.1-hf | 89.47 | +| yi-6b-hf | 23.76 | +| yi-34b-hf | 14.73 | +| deepseek-7b-base-hf | 59.48 | +| deepseek-67b-base-hf | 72.15 | + +## Chat Models + +| model | triviaqa | +|:-----------------------------:|-----------:| +| qwen1.5-0.5b-chat-hf | 19.84 | +| qwen1.5-1.8b-chat-hf | 35.81 | +| qwen1.5-4b-chat-hf | 48.93 | +| qwen1.5-7b-chat-hf | 53.65 | +| qwen1.5-14b-chat-hf | 62.58 | +| qwen1.5-32b-chat-hf | 74.72 | +| qwen1.5-72b-chat-hf | 83.25 | +| qwen1.5-110b-chat-hf | 86.20 | +| internlm2-chat-1.8b-hf | 46.69 | +| internlm2-chat-1.8b-sft-hf | 46.50 | +| internlm2-chat-7b-hf | 69.54 | +| internlm2-chat-7b-sft-hf | 70.75 | +| internlm2-chat-20b-hf | 75.53 | +| internlm2-chat-20b-sft-hf | 75.90 | +| llama-3-8b-instruct-hf | 78.99 | +| llama-3-70b-instruct-hf | 89.79 | +| llama-3-8b-instruct-lmdeploy | 76.77 | +| llama-3-70b-instruct-lmdeploy | 89.62 | +| mistral-7b-instruct-v0.1-hf | 62.94 | +| mistral-7b-instruct-v0.2-hf | 67.72 | +| mixtral-8x7b-instruct-v0.1-hf | 79.57 | diff --git a/configs/datasets/winogrande/README.md b/configs/datasets/winogrande/README.md new file mode 100644 index 000000000..816732682 --- /dev/null +++ b/configs/datasets/winogrande/README.md @@ -0,0 +1,64 @@ +# WinoGrande + +## Base Models + +| model | winogrande | +|:------------------------:|-------------:| +| llama-7b-turbomind | 71.19 | +| llama-13b-turbomind | 76.16 | +| llama-30b-turbomind | 80.66 | +| llama-65b-turbomind | 82.16 | +| llama-2-7b-turbomind | 74.03 | +| llama-2-13b-turbomind | 76.48 | +| llama-2-70b-turbomind | 83.98 | +| llama-3-8b-turbomind | 77.82 | +| llama-3-70b-turbomind | 83.43 | +| internlm2-1.8b-turbomind | 66.77 | +| internlm2-7b-turbomind | 83.50 | +| internlm2-20b-turbomind | 84.69 | +| qwen-1.8b-turbomind | 61.25 | +| qwen-7b-turbomind | 72.06 | +| qwen-14b-turbomind | 72.45 | +| qwen-72b-turbomind | 82.56 | +| qwen1.5-0.5b-hf | 57.38 | 
+| qwen1.5-1.8b-hf | 60.46 | +| qwen1.5-4b-hf | 65.90 | +| qwen1.5-7b-hf | 70.01 | +| qwen1.5-14b-hf | 72.93 | +| qwen1.5-32b-hf | 78.69 | +| qwen1.5-72b-hf | 80.74 | +| qwen1.5-moe-a2-7b-hf | 71.43 | +| mistral-7b-v0.1-hf | 78.30 | +| mistral-7b-v0.2-hf | 77.51 | +| mixtral-8x7b-v0.1-hf | 81.53 | +| mixtral-8x22b-v0.1-hf | 86.50 | +| yi-6b-hf | 74.35 | +| yi-34b-hf | 79.01 | +| deepseek-7b-base-hf | 74.11 | +| deepseek-67b-base-hf | 79.32 | + +## Chat Models + +| model | winogrande | +|:-----------------------------:|-------------:| +| qwen1.5-0.5b-chat-hf | 50.51 | +| qwen1.5-1.8b-chat-hf | 51.07 | +| qwen1.5-4b-chat-hf | 57.54 | +| qwen1.5-7b-chat-hf | 65.27 | +| qwen1.5-14b-chat-hf | 70.09 | +| qwen1.5-32b-chat-hf | 77.90 | +| qwen1.5-72b-chat-hf | 80.82 | +| qwen1.5-110b-chat-hf | 82.32 | +| internlm2-chat-1.8b-hf | 57.62 | +| internlm2-chat-1.8b-sft-hf | 57.93 | +| internlm2-chat-7b-hf | 73.56 | +| internlm2-chat-7b-sft-hf | 73.80 | +| internlm2-chat-20b-hf | 81.06 | +| internlm2-chat-20b-sft-hf | 81.37 | +| llama-3-8b-instruct-hf | 66.22 | +| llama-3-70b-instruct-hf | 81.29 | +| llama-3-8b-instruct-lmdeploy | 66.93 | +| llama-3-70b-instruct-lmdeploy | 81.22 | +| mistral-7b-instruct-v0.1-hf | 58.56 | +| mistral-7b-instruct-v0.2-hf | 59.43 | +| mixtral-8x7b-instruct-v0.1-hf | 65.75 | diff --git a/configs/eval_charm.py b/configs/eval_charm.py index fb6f17002..c1b65ccc0 100644 --- a/configs/eval_charm.py +++ b/configs/eval_charm.py @@ -1,7 +1,7 @@ from mmengine.config import read_base with read_base(): - from .datasets.CHARM.charm_rea_gen_f8fca2 import charm_rea_datasets as datasets + from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model # from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model @@ -34,19 +34,19 @@ # dataset version metric mode internlm2-chat-7b-turbomind # ------------------------------------------------------------- --------- ------------- ------ ----------------------------- -# charm-rea-Direct - naive_average gen 49.51 -# charm-rea-ZH-CoT - naive_average gen 61.33 -# charm-rea-EN-CoT - naive_average gen 54.55 -# charm-rea-XLT - naive_average gen 58.46 -# charm-rea-Translate-EN - naive_average gen 56.15 +# charm-reason-Direct - naive_average gen 49.51 +# charm-reason-ZH-CoT - naive_average gen 61.33 +# charm-reason-EN-CoT - naive_average gen 54.55 +# charm-reason-XLT - naive_average gen 58.46 +# charm-reason-Translate-EN - naive_average gen 56.15 # - - - - -# charm-rea-Chinese_Direct - naive_average gen 47.14 -# charm-rea-Chinese_ZH-CoT - naive_average gen 58.40 -# charm-rea-Chinese_EN-CoT - naive_average gen 48.31 -# charm-rea-Chinese_XLT - naive_average gen 53.57 -# charm-rea-Chinese_Translate-EN - naive_average gen 48.21 -# charm-rea-Global_Direct - naive_average gen 51.88 -# charm-rea-Global_ZH-CoT - naive_average gen 64.26 -# charm-rea-Global_EN-CoT - naive_average gen 60.79 -# charm-rea-Global_XLT - naive_average gen 63.36 -# charm-rea-Global_Translate-EN - naive_average gen 64.10 +# charm-reason-Chinese_Direct - naive_average gen 47.14 +# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40 +# charm-reason-Chinese_EN-CoT - naive_average gen 48.31 +# charm-reason-Chinese_XLT - naive_average gen 53.57 +# charm-reason-Chinese_Translate-EN - naive_average gen 48.21 +# charm-reason-Global_Direct - naive_average gen 51.88 +# charm-reason-Global_ZH-CoT - naive_average gen 64.26 +# charm-reason-Global_EN-CoT - 
naive_average gen 60.79 +# charm-reason-Global_XLT - naive_average gen 63.36 +# charm-reason-Global_Translate-EN - naive_average gen 64.10 diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py index e4a10df10..1b1a93d60 100644 --- a/configs/eval_subjective_alignbench.py +++ b/configs/eval_subjective_alignbench.py @@ -4,7 +4,6 @@ from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/eval_subjective_alpacaeval_oc.py b/configs/eval_subjective_alpacaeval_oc.py index bc5b04488..53c2f2459 100644 --- a/configs/eval_subjective_alpacaeval_oc.py +++ b/configs/eval_subjective_alpacaeval_oc.py @@ -5,7 +5,7 @@ from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 -from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN +from opencompass.models.openai_api import OpenAI from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/eval_subjective_alpacaeval_official.py b/configs/eval_subjective_alpacaeval_official.py index af9331b81..eac878cc7 100644 --- a/configs/eval_subjective_alpacaeval_official.py +++ b/configs/eval_subjective_alpacaeval_official.py @@ -4,7 +4,7 @@ from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 -from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN +from opencompass.models.openai_api import OpenAI from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py index e3d12218c..e39b883f3 100644 --- a/configs/eval_subjective_compassarena.py +++ b/configs/eval_subjective_compassarena.py @@ -6,7 +6,6 @@ from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/eval_subjective_creationbench.py b/configs/eval_subjective_creationbench.py index 922225f11..2346de52e 100644 --- a/configs/eval_subjective_creationbench.py +++ b/configs/eval_subjective_creationbench.py @@ -4,7 +4,6 @@ from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.models.openai_api import 
OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py index 64e55ae12..b371b1985 100644 --- a/configs/eval_subjective_judge_pandalm.py +++ b/configs/eval_subjective_judge_pandalm.py @@ -3,7 +3,7 @@ with read_base(): from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3 +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.partitioners import NaivePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.runners import LocalRunner diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index 0d682b655..38a442e45 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -4,7 +4,6 @@ from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner diff --git a/configs/models/hf_internlm/README.md b/configs/models/hf_internlm/README.md new file mode 100644 index 000000000..3c4b19d08 --- /dev/null +++ b/configs/models/hf_internlm/README.md @@ -0,0 +1,124 @@ +# InternLM Model Details + +## InternLM + +InternLM is pre-trained on a large corpus of 1.6T tokens using a multi-phase progressive process, and then fine-tuned to align with human preferences. We also developed a training system called Uniscale-LLM for efficient large language model training. The evaluation on a number of benchmarks shows that InternLM achieves state-of-the-art performance in multiple aspects, including knowledge understanding, reading comprehension, mathematics, and coding. With such well-rounded capabilities, InternLM achieves outstanding performance on comprehensive exams, including MMLU, AGIEval, C-Eval, and GAOKAO-Bench, without resorting to external tools. On these benchmarks, InternLM not only significantly outperforms open-source models, but also obtains superior performance compared to ChatGPT. InternLM also demonstrates an excellent capability for understanding the Chinese language and Chinese culture, which makes it a suitable foundation model to support Chinese-oriented language applications. + +## InternLM2 + +The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques.
The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution. + +# Evaluation Command + +## Base Models + +```bash +python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug +python3 run.py --models hf_internlm2_7b --datasets cmmlu_ppl_041cbf --debug +python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug +python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug +python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug +python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug +python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug +python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug +python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug +python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug +python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug +python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug +python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug +``` + +## Chat Models + +```bash +python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug +python3 run.py --models hf_internlm2_chat_7b --datasets cmmlu_gen_c13365 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug +python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug +python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug +python3 run.py --models hf_internlm2_chat_7b --datasets 
sanitized_mbpp_mdblock_gen_a447ff --debug +python3 run.py --models hf_internlm2_chat_7b --datasets lcbench_gen_5ff288 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug +python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug +``` + +# Benchmarks + +We provide reference results for the classifical models, you can reproduce these results by following the aforementioned commands. + +## Base Models + +| dataset | internlm2-1.8b-turbomind | internlm2-7b-turbomind | internlm2-20b-turbomind | +|:------------:|---------------------------:|-------------------------:|--------------------------:| +| mmlu | 45.99 | 65.84 | 67.58 | +| cmmlu | 45.27 | 66.17 | 68.28 | +| ceval-test | 44.79 | 63.54 | 67.28 | +| GaokaoBench | 23.78 | 41.41 | 58.99 | +| triviaqa | 38.42 | 69.15 | 74.03 | +| nq | 20.66 | 41.05 | 43.55 | +| race-high | 64.72 | 72.56 | 72.90 | +| winogrande | 66.77 | 83.50 | 84.69 | +| hellaswag | 44.86 | 89.52 | 91.41 | +| bbh | 36.03 | 63.56 | 71.29 | +| gsm8k | 30.40 | 69.98 | 76.80 | +| math | 9.42 | 25.16 | 32.24 | +| TheoremQA | 10.50 | 21.88 | 26.00 | +| humaneval | 30.49 | 48.17 | 51.83 | +| mbpp | 30.74 | 54.47 | 59.92 | +| lcbench | 4.34 | 12.16 | 18.46 | +| GPQA_diamond | 24.24 | 28.28 | 31.31 | + +## Chat Models + +| dataset | internlm2-chat-1.8b-hf | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-hf | internlm2-chat-20b-sft-hf | +|:------------:|-------------------------:|-----------------------------:|-----------------------:|---------------------------:|------------------------:|----------------------------:| +| mmlu | 47.58 | 47.44 | 63.05 | 63.33 | 67.37 | 67.34 | +| cmmlu | 46.11 | 46.27 | 62.10 | 62.38 | 66.26 | 66.39 | +| ceval-test | 47.04 | 47.19 | 58.75 | 58.96 | 63.12 | 63.16 | +| GaokaoBench | 29.73 | 28.79 | 54.54 | 55.39 | 57.95 | 57.62 | +| triviaqa | 46.69 | 46.50 | 69.54 | 70.75 | 75.53 | 75.90 | +| nq | 19.09 | 18.14 | 28.73 | 30.78 | 28.75 | 34.10 | +| race-high | 73.87 | 73.81 | 84.51 | 84.88 | 88.02 | 88.11 | +| winogrande | 57.62 | 57.93 | 73.56 | 73.80 | 81.06 | 81.37 | +| hellaswag | 60.47 | 61.58 | 84.80 | 85.21 | 88.48 | 88.95 | +| bbh | 37.69 | 37.12 | 57.83 | 57.19 | 68.24 | 69.38 | +| gsm8k | 39.73 | 36.85 | 69.90 | 69.83 | 75.21 | 76.95 | +| math | 14.06 | 13.10 | 28.08 | 27.60 | 34.68 | 32.54 | +| TheoremQA | 13.63 | 12.88 | 18.50 | 18.75 | 23.00 | 25.12 | +| humaneval | 33.54 | 34.15 | 56.71 | 61.59 | 67.68 | 67.68 | +| mbpp | 39.69 | 36.19 | 57.59 | 55.64 | 68.87 | 69.65 | +| lcbench | 4.52 | 3.56 | 14.60 | 14.34 | 19.64 | 20.55 | +| GPQA_diamond | 25.76 | 26.26 | 28.28 | 27.27 | 30.30 | 29.29 | +| IFEval | 18.30 | 18.67 | 34.75 | 39.19 | 36.41 | 44.55 | + +# Citation + +```BibTeX +@misc{2023internlm, + title={InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities}, + author={InternLM Team}, + howpublished = {\url{https://github.com/InternLM/InternLM-techreport}}, + year={2023} +} +@misc{cai2024internlm2, + title={InternLM2 Technical Report}, + author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and 
Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin}, + year={2024}, + eprint={2403.17297}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py new file mode 100644 index 000000000..346e93419 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_1_8b.py @@ -0,0 +1,23 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm2-chat-1.8b-turbomind', + path='internlm/internlm2-chat-1_8b', + engine_config=dict( + max_batch_size=16, + tp=1, + ), + gen_config=dict( + top_k=1, + temperature=1e-6, + top_p=0.9, + ), + max_seq_len=2048, + max_out_len=1024, + batch_size=32768, + run_cfg=dict(num_gpus=1), + stop_words=['', '<|im_end|>'], + ) +] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_series.py b/configs/models/hf_internlm/lmdeploy_internlm2_series.py index f01ee7142..6c2868340 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_series.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_series.py @@ -3,7 +3,9 @@ settings = [ ('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1), ('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1), + ('internlm2-base-7b-turbomind', 'internlm/internlm2-base-7b', 1), ('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2), + ('internlm2-base-20b-turbomind', 'internlm/internlm2-base-20b', 2), ] models = [] diff --git a/configs/models/hf_llama/hf_llama2_13b.py b/configs/models/hf_llama/hf_llama2_13b.py index 4044f87e1..9d51ac0c6 100644 --- a/configs/models/hf_llama/hf_llama2_13b.py +++ b/configs/models/hf_llama/hf_llama2_13b.py @@ -7,6 +7,6 @@ path='meta-llama/Llama-2-13b-hf', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1), + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/hf_llama/hf_llama2_13b_chat.py b/configs/models/hf_llama/hf_llama2_13b_chat.py index 8460ad422..bd2b54be4 100644 --- a/configs/models/hf_llama/hf_llama2_13b_chat.py +++ b/configs/models/hf_llama/hf_llama2_13b_chat.py @@ -7,6 +7,6 @@ path='meta-llama/Llama-2-13b-chat-hf', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1), + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/hf_llama/hf_llama2_70b.py b/configs/models/hf_llama/hf_llama2_70b.py index 97d28a4b4..0aad6e66a 100644 --- a/configs/models/hf_llama/hf_llama2_70b.py +++ b/configs/models/hf_llama/hf_llama2_70b.py @@ -7,6 +7,6 @@ path='meta-llama/Llama-2-70b-hf', max_out_len=1024, batch_size=8, - 
run_cfg=dict(num_gpus=4), + run_cfg=dict(num_gpus=8), ) ] diff --git a/configs/models/hf_llama/hf_llama3_70b.py b/configs/models/hf_llama/hf_llama3_70b.py index b3cce9504..32ab4f553 100644 --- a/configs/models/hf_llama/hf_llama3_70b.py +++ b/configs/models/hf_llama/hf_llama3_70b.py @@ -7,6 +7,6 @@ path='meta-llama/Meta-Llama-3-70B', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1), + run_cfg=dict(num_gpus=8), ) ] diff --git a/configs/models/hf_llama/hf_llama_13b.py b/configs/models/hf_llama/hf_llama_13b.py index 70d1b9cb8..6c6976140 100644 --- a/configs/models/hf_llama/hf_llama_13b.py +++ b/configs/models/hf_llama/hf_llama_13b.py @@ -7,6 +7,6 @@ path='huggyllama/llama-13b', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1), + run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/hf_llama/hf_llama_30b.py b/configs/models/hf_llama/hf_llama_30b.py index 063a69275..50d0f80d1 100644 --- a/configs/models/hf_llama/hf_llama_30b.py +++ b/configs/models/hf_llama/hf_llama_30b.py @@ -7,6 +7,6 @@ path='huggyllama/llama-30b', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=2), + run_cfg=dict(num_gpus=4), ) ] diff --git a/configs/models/hf_llama/hf_llama_65b.py b/configs/models/hf_llama/hf_llama_65b.py index 9db5fcc95..a57612999 100644 --- a/configs/models/hf_llama/hf_llama_65b.py +++ b/configs/models/hf_llama/hf_llama_65b.py @@ -7,6 +7,6 @@ path='huggyllama/llama-65b', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=4), + run_cfg=dict(num_gpus=8), ) ] diff --git a/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py b/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py new file mode 100644 index 000000000..fdc51c8d6 --- /dev/null +++ b/configs/models/mistral/hf_mistral_7b_instruct_v0_3.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='mistral-7b-instruct-v0.3-hf', + path='mistralai/Mistral-7B-Instruct-v0.3', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/mistral/hf_mistral_7b_v0_3.py b/configs/models/mistral/hf_mistral_7b_v0_3.py new file mode 100644 index 000000000..90d9a829e --- /dev/null +++ b/configs/models/mistral/hf_mistral_7b_v0_3.py @@ -0,0 +1,13 @@ +from opencompass.models import HuggingFaceBaseModel + + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='mistral-7b-v0.3-hf', + path='mistralai/Mistral-7B-v0.3', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/README.md b/configs/models/qwen/README.md new file mode 100644 index 000000000..f8dbd8adf --- /dev/null +++ b/configs/models/qwen/README.md @@ -0,0 +1,142 @@ +# Qwen Model Details + +## Qwen + +Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. 
The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, while falling only slightly behind the proprietary models. + +## Qwen1.5 + +Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. In comparison with the previously released Qwen, the improvements include: + +- 8 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, 32B and 72B dense models, and an MoE model with 14B total parameters and 2.7B activated; +- Significant performance improvement in human preference for chat models; +- Multilingual support of both base and chat models; +- Stable support of 32K context length for models of all sizes; +- No need for trust_remote_code. + +# Evaluation Command + +## Base Models + +```bash +python3 run.py --models hf_qwen1_5_7b --datasets mmlu_ppl_ac766d --debug +python3 run.py --models hf_qwen1_5_7b --datasets cmmlu_ppl_041cbf --debug +python3 run.py --models hf_qwen1_5_7b --datasets ceval_internal_ppl_93e5ce --debug +python3 run.py --models hf_qwen1_5_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug +python3 run.py --models hf_qwen1_5_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug +python3 run.py --models hf_qwen1_5_7b --datasets nq_open_1shot_gen_20a989 --debug +python3 run.py --models hf_qwen1_5_7b --datasets race_ppl_abed12 --debug +python3 run.py --models hf_qwen1_5_7b --datasets winogrande_5shot_ll_252f01 --debug +python3 run.py --models hf_qwen1_5_7b --datasets hellaswag_10shot_ppl_59c85e --debug +python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug +python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug +python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug +python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug +python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug +python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug +``` + +## Chat Models + +```bash +python3 run.py --models hf_qwen1_5_7b_chat --datasets mmlu_gen_4d595a --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets cmmlu_gen_c13365 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets ceval_internal_gen_2daf24 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets GaokaoBench_no_subjective_gen_4c31db --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets triviaqa_wiki_1shot_gen_eaf81e --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets nq_open_1shot_gen_01cf41 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets race_gen_69ee4f --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets winogrande_5shot_gen_b36770 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets hellaswag_10shot_gen_e42710 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets bbh_gen_5b92b0 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets gsm8k_gen_1d7fe4 --debug +python3 run.py 
--models hf_qwen1_5_7b_chat --datasets math_0shot_gen_393424 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets humaneval_gen_8e312c --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets sanitized_mbpp_mdblock_gen_a447ff --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets lcbench_gen_5ff288 --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets gpqa_gen_4baadb --debug +python3 run.py --models hf_qwen1_5_7b_chat --datasets IFEval_gen_3321a3 --debug +``` + +# Benchmarks + +We provide reference results for the classifical models, you can reproduce these results by following the aforementioned commands. + +## Base Models + +| dataset | qwen-1.8b-turbomind | qwen-7b-turbomind | qwen-14b-turbomind | qwen-72b-turbomind | +|:------------:|----------------------:|--------------------:|---------------------:|---------------------:| +| mmlu | 46.61 | 59.75 | 67.85 | 77.36 | +| cmmlu | 51.98 | 62.10 | 70.05 | 83.32 | +| ceval-test | 54.24 | 62.06 | 70.33 | 83.25 | +| GaokaoBench | 22.11 | 35.32 | 54.07 | 77.56 | +| triviaqa | 22.76 | 53.61 | 49.72 | 79.13 | +| nq | 5.68 | 17.87 | 13.77 | 18.20 | +| race-high | 63.09 | 80.30 | 88.11 | 90.62 | +| winogrande | 61.25 | 72.06 | 72.45 | 82.56 | +| hellaswag | 38.04 | 64.62 | 85.88 | 90.40 | +| bbh | 22.53 | 45.89 | 56.75 | 63.35 | +| gsm8k | 23.73 | 54.36 | 61.64 | 79.68 | +| math | 6.30 | 15.56 | 30.38 | 44.18 | +| TheoremQA | 9.38 | 15.00 | 21.62 | 27.12 | +| humaneval | 16.46 | 23.78 | 23.78 | 66.46 | +| mbpp | 2.72 | 46.69 | 55.64 | 65.76 | +| lcbench | 1.82 | 4.95 | 8.86 | 16.86 | +| GPQA_diamond | 28.79 | 24.75 | 27.78 | 31.31 | + +| dataset | qwen1.5-0.5b-hf | qwen1.5-1.8b-hf | qwen1.5-4b-hf | qwen1.5-7b-hf | qwen1.5-14b-hf | qwen1.5-32b-hf | qwen1.5-72b-hf | +|:------------:|------------------:|------------------:|----------------:|----------------:|-----------------:|-----------------:|-----------------:| +| mmlu | 39.98 | 47.14 | 57.03 | 62.15 | 69.10 | 73.88 | 77.02 | +| cmmlu | 46.05 | 57.45 | 66.38 | 71.86 | 76.95 | 81.58 | 83.00 | +| ceval-test | 48.36 | 58.67 | 66.55 | 72.49 | 76.93 | 82.50 | 83.03 | +| GaokaoBench | 30.67 | 35.66 | 54.31 | 65.99 | 66.60 | 79.01 | 80.26 | +| triviaqa | 21.24 | 34.32 | 44.59 | 56.60 | 59.96 | 56.20 | 77.81 | +| nq | 6.01 | 10.28 | 15.73 | 18.61 | 16.07 | 21.75 | 20.53 | +| race-high | 54.66 | 67.27 | 78.50 | 82.73 | 87.99 | 90.57 | 90.45 | +| winogrande | 57.38 | 60.46 | 65.90 | 70.01 | 72.93 | 78.69 | 80.74 | +| hellaswag | 29.19 | 42.32 | 55.89 | 68.51 | 83.86 | 87.28 | 90.41 | +| bbh | 20.54 | 27.01 | 34.81 | 39.87 | 50.38 | 67.47 | 58.81 | +| gsm8k | 13.27 | 34.87 | 47.61 | 54.36 | 63.53 | 72.71 | 79.53 | +| math | 4.16 | 11.32 | 17.50 | 17.34 | 36.18 | 45.74 | 41.56 | +| TheoremQA | 5.88 | 12.00 | 13.75 | 4.25 | 12.62 | 26.62 | 26.62 | +| humaneval | 8.54 | 23.17 | 41.46 | 53.05 | 57.32 | 70.12 | 65.85 | +| mbpp | 5.06 | 15.95 | 45.91 | 52.14 | 52.14 | 59.14 | 61.09 | +| lcbench | 0.87 | 2.00 | 5.65 | 6.69 | 12.69 | 14.34 | 15.29 | +| GPQA_diamond | 23.74 | 28.79 | 23.23 | 20.71 | 32.32 | 30.81 | 31.82 | + +## Chat Models + +| dataset | qwen1.5-0.5b-chat-hf | qwen1.5-1.8b-chat-hf | qwen1.5-4b-chat-hf | qwen1.5-7b-chat-hf | qwen1.5-14b-chat-hf | qwen1.5-32b-chat-hf | qwen1.5-72b-chat-hf | qwen1.5-110b-chat-hf | 
+|:------------:|-----------------------:|-----------------------:|---------------------:|---------------------:|----------------------:|----------------------:|----------------------:|-----------------------:| +| mmlu | 35.32 | 45.62 | 55.90 | 61.79 | 67.96 | 75.36 | 77.24 | 77.95 | +| cmmlu | 31.55 | 48.93 | 58.53 | 68.78 | 75.07 | 80.39 | 82.48 | 86.46 | +| ceval-test | 36.88 | 55.17 | 61.54 | 68.71 | 74.80 | 80.47 | 81.53 | 87.33 | +| GaokaoBench | 21.51 | 46.19 | 59.11 | 70.55 | 80.39 | 86.15 | 88.58 | 89.59 | +| triviaqa | 19.84 | 35.81 | 48.93 | 53.65 | 62.58 | 74.72 | 83.25 | 86.20 | +| nq | 7.42 | 10.22 | 19.31 | 16.87 | 20.53 | 25.26 | 35.21 | 36.98 | +| race-high | 49.03 | 66.24 | 73.53 | 83.28 | 87.51 | 91.22 | 91.11 | 92.31 | +| winogrande | 50.51 | 51.07 | 57.54 | 65.27 | 70.09 | 77.90 | 80.82 | 82.32 | +| hellaswag | 29.60 | 41.71 | 60.45 | 71.58 | 79.70 | 88.56 | 89.37 | 91.11 | +| bbh | 24.12 | 26.82 | 43.15 | 38.12 | 55.38 | 69.28 | 72.97 | 71.04 | +| gsm8k | 8.79 | 27.60 | 47.61 | 56.25 | 64.90 | 79.91 | 77.03 | 79.53 | +| math | 0.56 | 4.94 | 7.34 | 22.14 | 32.22 | 41.80 | 45.22 | 54.38 | +| TheoremQA | 9.00 | 9.25 | 13.88 | 12.25 | 13.63 | 19.25 | 22.75 | 17.50 | +| humaneval | 9.15 | 15.85 | 30.49 | 40.85 | 50.00 | 57.93 | 60.37 | 65.24 | +| mbpp | 11.28 | 22.57 | 43.58 | 50.58 | 56.03 | 65.37 | 66.93 | 68.48 | +| lcbench | 0.00 | 1.65 | 5.56 | 8.78 | 14.42 | 10.78 | 18.77 | 34.58 | +| GPQA_diamond | 19.70 | 29.80 | 25.25 | 31.82 | 30.30 | 31.31 | 32.83 | 35.86 | +| IFEval | 13.12 | 16.08 | 25.51 | 38.82 | 42.51 | 49.54 | 51.02 | 55.08 | + +# Citation + +```BibTeX +@article{qwen, + title={Qwen Technical Report}, + author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu}, + journal={arXiv preprint arXiv:2309.16609}, + year={2023} +} +``` diff --git a/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py b/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py index 77f0fc57e..0e112c8bf 100644 --- a/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py +++ b/configs/models/qwen/hf_qwen1_5_moe_a2_7b.py @@ -1,25 +1,12 @@ -from opencompass.models import HuggingFaceCausalLM +from opencompass.models import HuggingFaceBaseModel models = [ dict( - type=HuggingFaceCausalLM, - abbr='qwen1.5-moe-a2-7b-hf', + type=HuggingFaceBaseModel, + abbr='qwen1.5-moe-a2.7b-hf', path='Qwen/Qwen1.5-MoE-A2.7B', - tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - pad_token_id=151645, - max_out_len=100, - max_seq_len=2048, + max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py index ebc09facc..d33ea7047 100644 --- 
a/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py @@ -1,33 +1,12 @@ -from opencompass.models import HuggingFaceCausalLM - -_meta_template = dict( - round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), - ], -) +from opencompass.models import HuggingFacewithChatTemplate models = [ dict( - type=HuggingFaceCausalLM, - abbr='qwen1.5-moe-a2-7b-chat-hf', + type=HuggingFacewithChatTemplate, + abbr='qwen1.5-moe-a2.7b-chat-hf', path='Qwen/Qwen1.5-MoE-A2.7B-Chat', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False, - ), - meta_template=_meta_template, - max_out_len=100, - max_seq_len=2048, + max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=4, num_procs=1), - end_str='<|im_end|>', - batch_padding=True, + run_cfg=dict(num_gpus=1), ) ] diff --git a/configs/models/qwen/hf_qwen2_0_5b.py b/configs/models/qwen/hf_qwen2_0_5b.py new file mode 100644 index 000000000..22305d6fc --- /dev/null +++ b/configs/models/qwen/hf_qwen2_0_5b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen2-0.5b-hf', + path='Qwen/Qwen2-0.5B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen2_1_5b.py b/configs/models/qwen/hf_qwen2_1_5b.py new file mode 100644 index 000000000..2c9d488a3 --- /dev/null +++ b/configs/models/qwen/hf_qwen2_1_5b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen2-1.5b-hf', + path='Qwen/Qwen2-1.5B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen2_72b.py b/configs/models/qwen/hf_qwen2_72b.py new file mode 100644 index 000000000..97cc74f6f --- /dev/null +++ b/configs/models/qwen/hf_qwen2_72b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen2-72b-hf', + path='Qwen/Qwen2-72B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=8), + ) +] diff --git a/configs/models/qwen/hf_qwen2_7b.py b/configs/models/qwen/hf_qwen2_7b.py new file mode 100644 index 000000000..a861151f8 --- /dev/null +++ b/configs/models/qwen/hf_qwen2_7b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen2-7b-hf', + path='Qwen/Qwen2-7B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/qwen/hf_qwen2_moe_57b_a14b.py b/configs/models/qwen/hf_qwen2_moe_57b_a14b.py new file mode 100644 index 000000000..adc0ba8f3 --- /dev/null +++ b/configs/models/qwen/hf_qwen2_moe_57b_a14b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen2-moe-57b-a14b-hf', + path='Qwen/Qwen2-MoE-57B-A14B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/hf_qwen_14b.py b/configs/models/qwen/hf_qwen_14b.py index 8c15c0329..ac0985833 100644 --- a/configs/models/qwen/hf_qwen_14b.py +++ b/configs/models/qwen/hf_qwen_14b.py @@ -7,6 +7,6 @@ path='Qwen/Qwen-14B', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=1), + 
run_cfg=dict(num_gpus=2), ) ] diff --git a/configs/models/qwen/hf_qwen_14b_chat.py b/configs/models/qwen/hf_qwen_14b_chat.py index 4d9cccee6..781451e11 100644 --- a/configs/models/qwen/hf_qwen_14b_chat.py +++ b/configs/models/qwen/hf_qwen_14b_chat.py @@ -1,12 +1,31 @@ -from opencompass.models import HuggingFacewithChatTemplate +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), + ], +) models = [ dict( - type=HuggingFacewithChatTemplate, + type=HuggingFaceCausalLM, abbr='qwen-14b-chat-hf', path='Qwen/Qwen-14B-Chat', + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + pad_token_id=151643, max_out_len=1024, + max_seq_len=8192, batch_size=8, - run_cfg=dict(num_gpus=1), + batch_padding=True, + meta_template=_meta_template, + run_cfg=dict(num_gpus=2), + end_str='<|im_end|>', ) ] diff --git a/configs/models/qwen/hf_qwen_1_8b_chat.py b/configs/models/qwen/hf_qwen_1_8b_chat.py index 1838a04aa..05b775644 100644 --- a/configs/models/qwen/hf_qwen_1_8b_chat.py +++ b/configs/models/qwen/hf_qwen_1_8b_chat.py @@ -1,12 +1,31 @@ -from opencompass.models import HuggingFacewithChatTemplate +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), + ], +) models = [ dict( - type=HuggingFacewithChatTemplate, + type=HuggingFaceCausalLM, abbr='qwen-1.8b-chat-hf', path='Qwen/Qwen-1_8B-Chat', + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + pad_token_id=151643, max_out_len=1024, + max_seq_len=8192, batch_size=8, + batch_padding=True, + meta_template=_meta_template, run_cfg=dict(num_gpus=1), + end_str='<|im_end|>', ) ] diff --git a/configs/models/qwen/hf_qwen_72b.py b/configs/models/qwen/hf_qwen_72b.py index 325315b8e..63eacfefa 100644 --- a/configs/models/qwen/hf_qwen_72b.py +++ b/configs/models/qwen/hf_qwen_72b.py @@ -7,6 +7,6 @@ path='Qwen/Qwen-72B', max_out_len=1024, batch_size=8, - run_cfg=dict(num_gpus=4), + run_cfg=dict(num_gpus=8), ) ] diff --git a/configs/models/qwen/hf_qwen_72b_chat.py b/configs/models/qwen/hf_qwen_72b_chat.py index 255aeb5db..bf214abb5 100644 --- a/configs/models/qwen/hf_qwen_72b_chat.py +++ b/configs/models/qwen/hf_qwen_72b_chat.py @@ -1,12 +1,31 @@ -from opencompass.models import HuggingFacewithChatTemplate +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), + ], +) models = [ dict( - type=HuggingFacewithChatTemplate, + type=HuggingFaceCausalLM, abbr='qwen-72b-chat-hf', path='Qwen/Qwen-72B-Chat', + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + pad_token_id=151643, max_out_len=1024, + max_seq_len=8192, batch_size=8, - run_cfg=dict(num_gpus=4), + batch_padding=True, + meta_template=_meta_template, + 
run_cfg=dict(num_gpus=8), + end_str='<|im_end|>', ) ] diff --git a/configs/models/qwen/hf_qwen_7b_chat.py b/configs/models/qwen/hf_qwen_7b_chat.py index e5479fb6e..9fe97f836 100644 --- a/configs/models/qwen/hf_qwen_7b_chat.py +++ b/configs/models/qwen/hf_qwen_7b_chat.py @@ -1,12 +1,31 @@ -from opencompass.models import HuggingFacewithChatTemplate +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), + ], +) models = [ dict( - type=HuggingFacewithChatTemplate, + type=HuggingFaceCausalLM, abbr='qwen-7b-chat-hf', path='Qwen/Qwen-7B-Chat', + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + pad_token_id=151643, max_out_len=1024, + max_seq_len=8192, batch_size=8, + batch_padding=True, + meta_template=_meta_template, run_cfg=dict(num_gpus=1), + end_str='<|im_end|>', ) ] diff --git a/configs/models/yi/hf_yi_1_5_34b.py b/configs/models/yi/hf_yi_1_5_34b.py new file mode 100644 index 000000000..38cd64245 --- /dev/null +++ b/configs/models/yi/hf_yi_1_5_34b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='yi-1.5-34b-hf', + path='01-ai/Yi-1.5-34B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/configs/models/yi/hf_yi_1_5_6b.py b/configs/models/yi/hf_yi_1_5_6b.py new file mode 100644 index 000000000..df4fd23c4 --- /dev/null +++ b/configs/models/yi/hf_yi_1_5_6b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='yi-1.5-6b-hf', + path='01-ai/Yi-1.5-6B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/models/yi/hf_yi_1_5_9b.py b/configs/models/yi/hf_yi_1_5_9b.py new file mode 100644 index 000000000..c38506a4a --- /dev/null +++ b/configs/models/yi/hf_yi_1_5_9b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='yi-1.5-9b-hf', + path='01-ai/Yi-1.5-9B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + ) +] diff --git a/configs/subjective/model_cfg.py b/configs/subjective/model_cfg.py index cac3e9df5..4b95f2a7e 100644 --- a/configs/subjective/model_cfg.py +++ b/configs/subjective/model_cfg.py @@ -1,5 +1,5 @@ from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 -from opencompass.models.openai_api import OpenAIAllesAPIN +from opencompass.models.openai_api import OpenAI from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner @@ -44,7 +44,7 @@ judge_model = dict( abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', + type=OpenAI, path='gpt-4-1106-preview', key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well url='', meta_template=api_meta_template, @@ -70,7 +70,7 @@ gpt4 = dict( abbr='gpt4-turbo', - type=OpenAIAllesAPIN, + type=OpenAI, path='gpt-4-1106-preview', key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well meta_template=api_meta_template, diff 
--git a/configs/summarizers/charm_rea.py b/configs/summarizers/charm_rea.py deleted file mode 100644 index 953b24539..000000000 --- a/configs/summarizers/charm_rea.py +++ /dev/null @@ -1,98 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .groups.charm_rea import charm_rea_summary_groups - -summarizer = dict( - dataset_abbrs=[ - 'charm-rea-Direct', - 'charm-rea-ZH-CoT', - 'charm-rea-EN-CoT', - 'charm-rea-XLT', - 'charm-rea-Translate-EN', - '', - 'charm-rea-Chinese_Direct', - 'charm-rea-Chinese_ZH-CoT', - 'charm-rea-Chinese_EN-CoT', - 'charm-rea-Chinese_XLT', - 'charm-rea-Chinese_Translate-EN', - 'charm-rea-Global_Direct', - 'charm-rea-Global_ZH-CoT', - 'charm-rea-Global_EN-CoT', - 'charm-rea-Global_XLT', - 'charm-rea-Global_Translate-EN', - '', - 'charm-rea-Chinese_Anachronisms_Judgment_Direct', - 'charm-rea-Chinese_Movie_and_Music_Recommendation_Direct', - 'charm-rea-Chinese_Natural_Language_Inference_Direct', - 'charm-rea-Chinese_Reading_Comprehension_Direct', - 'charm-rea-Chinese_Sequence_Understanding_Direct', - 'charm-rea-Chinese_Sport_Understanding_Direct', - 'charm-rea-Chinese_Time_Understanding_Direct', - 'charm-rea-Global_Anachronisms_Judgment_Direct', - 'charm-rea-Global_Movie_and_Music_Recommendation_Direct', - 'charm-rea-Global_Natural_Language_Inference_Direct', - 'charm-rea-Global_Reading_Comprehension_Direct', - 'charm-rea-Global_Sequence_Understanding_Direct', - 'charm-rea-Global_Sport_Understanding_Direct', - 'charm-rea-Global_Time_Understanding_Direct', - 'charm-rea-Chinese_Anachronisms_Judgment_ZH-CoT', - 'charm-rea-Chinese_Movie_and_Music_Recommendation_ZH-CoT', - 'charm-rea-Chinese_Natural_Language_Inference_ZH-CoT', - 'charm-rea-Chinese_Reading_Comprehension_ZH-CoT', - 'charm-rea-Chinese_Sequence_Understanding_ZH-CoT', - 'charm-rea-Chinese_Sport_Understanding_ZH-CoT', - 'charm-rea-Chinese_Time_Understanding_ZH-CoT', - 'charm-rea-Global_Anachronisms_Judgment_ZH-CoT', - 'charm-rea-Global_Movie_and_Music_Recommendation_ZH-CoT', - 'charm-rea-Global_Natural_Language_Inference_ZH-CoT', - 'charm-rea-Global_Reading_Comprehension_ZH-CoT', - 'charm-rea-Global_Sequence_Understanding_ZH-CoT', - 'charm-rea-Global_Sport_Understanding_ZH-CoT', - 'charm-rea-Global_Time_Understanding_ZH-CoT', - 'charm-rea-Chinese_Anachronisms_Judgment_EN-CoT', - 'charm-rea-Chinese_Movie_and_Music_Recommendation_EN-CoT', - 'charm-rea-Chinese_Natural_Language_Inference_EN-CoT', - 'charm-rea-Chinese_Reading_Comprehension_EN-CoT', - 'charm-rea-Chinese_Sequence_Understanding_EN-CoT', - 'charm-rea-Chinese_Sport_Understanding_EN-CoT', - 'charm-rea-Chinese_Time_Understanding_EN-CoT', - 'charm-rea-Global_Anachronisms_Judgment_EN-CoT', - 'charm-rea-Global_Movie_and_Music_Recommendation_EN-CoT', - 'charm-rea-Global_Natural_Language_Inference_EN-CoT', - 'charm-rea-Global_Reading_Comprehension_EN-CoT', - 'charm-rea-Global_Sequence_Understanding_EN-CoT', - 'charm-rea-Global_Sport_Understanding_EN-CoT', - 'charm-rea-Global_Time_Understanding_EN-CoT', - 'charm-rea-Chinese_Anachronisms_Judgment_XLT', - 'charm-rea-Chinese_Movie_and_Music_Recommendation_XLT', - 'charm-rea-Chinese_Natural_Language_Inference_XLT', - 'charm-rea-Chinese_Reading_Comprehension_XLT', - 'charm-rea-Chinese_Sequence_Understanding_XLT', - 'charm-rea-Chinese_Sport_Understanding_XLT', - 'charm-rea-Chinese_Time_Understanding_XLT', - 'charm-rea-Global_Anachronisms_Judgment_XLT', - 'charm-rea-Global_Movie_and_Music_Recommendation_XLT', - 'charm-rea-Global_Natural_Language_Inference_XLT', - 
'charm-rea-Global_Reading_Comprehension_XLT', - 'charm-rea-Global_Sequence_Understanding_XLT', - 'charm-rea-Global_Sport_Understanding_XLT', - 'charm-rea-Global_Time_Understanding_XLT', - 'charm-rea-Chinese_Anachronisms_Judgment_Translate-EN', - 'charm-rea-Chinese_Movie_and_Music_Recommendation_Translate-EN', - 'charm-rea-Chinese_Natural_Language_Inference_Translate-EN', - 'charm-rea-Chinese_Reading_Comprehension_Translate-EN', - 'charm-rea-Chinese_Sequence_Understanding_Translate-EN', - 'charm-rea-Chinese_Sport_Understanding_Translate-EN', - 'charm-rea-Chinese_Time_Understanding_Translate-EN', - 'charm-rea-Global_Anachronisms_Judgment_Translate-EN', - 'charm-rea-Global_Movie_and_Music_Recommendation_Translate-EN', - 'charm-rea-Global_Natural_Language_Inference_Translate-EN', - 'charm-rea-Global_Reading_Comprehension_Translate-EN', - 'charm-rea-Global_Sequence_Understanding_Translate-EN', - 'charm-rea-Global_Sport_Understanding_Translate-EN', - 'charm-rea-Global_Time_Understanding_Translate-EN', - ], - summary_groups=sum( - [v for k, v in locals().items() if k.endswith('_summary_groups')], []) -) diff --git a/configs/summarizers/charm_reason.py b/configs/summarizers/charm_reason.py new file mode 100644 index 000000000..8e12e9017 --- /dev/null +++ b/configs/summarizers/charm_reason.py @@ -0,0 +1,98 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.charm_reason import charm_reason_summary_groups + +summarizer = dict( + dataset_abbrs=[ + 'charm-reason-Direct', + 'charm-reason-ZH-CoT', + 'charm-reason-EN-CoT', + 'charm-reason-XLT', + 'charm-reason-Translate-EN', + '', + 'charm-reason-Chinese_Direct', + 'charm-reason-Chinese_ZH-CoT', + 'charm-reason-Chinese_EN-CoT', + 'charm-reason-Chinese_XLT', + 'charm-reason-Chinese_Translate-EN', + 'charm-reason-Global_Direct', + 'charm-reason-Global_ZH-CoT', + 'charm-reason-Global_EN-CoT', + 'charm-reason-Global_XLT', + 'charm-reason-Global_Translate-EN', + '', + 'charm-reason-Chinese_Anachronisms_Judgment_Direct', + 'charm-reason-Chinese_Movie_and_Music_Recommendation_Direct', + 'charm-reason-Chinese_Natural_Language_Inference_Direct', + 'charm-reason-Chinese_Reading_Comprehension_Direct', + 'charm-reason-Chinese_Sequence_Understanding_Direct', + 'charm-reason-Chinese_Sport_Understanding_Direct', + 'charm-reason-Chinese_Time_Understanding_Direct', + 'charm-reason-Global_Anachronisms_Judgment_Direct', + 'charm-reason-Global_Movie_and_Music_Recommendation_Direct', + 'charm-reason-Global_Natural_Language_Inference_Direct', + 'charm-reason-Global_Reading_Comprehension_Direct', + 'charm-reason-Global_Sequence_Understanding_Direct', + 'charm-reason-Global_Sport_Understanding_Direct', + 'charm-reason-Global_Time_Understanding_Direct', + 'charm-reason-Chinese_Anachronisms_Judgment_ZH-CoT', + 'charm-reason-Chinese_Movie_and_Music_Recommendation_ZH-CoT', + 'charm-reason-Chinese_Natural_Language_Inference_ZH-CoT', + 'charm-reason-Chinese_Reading_Comprehension_ZH-CoT', + 'charm-reason-Chinese_Sequence_Understanding_ZH-CoT', + 'charm-reason-Chinese_Sport_Understanding_ZH-CoT', + 'charm-reason-Chinese_Time_Understanding_ZH-CoT', + 'charm-reason-Global_Anachronisms_Judgment_ZH-CoT', + 'charm-reason-Global_Movie_and_Music_Recommendation_ZH-CoT', + 'charm-reason-Global_Natural_Language_Inference_ZH-CoT', + 'charm-reason-Global_Reading_Comprehension_ZH-CoT', + 'charm-reason-Global_Sequence_Understanding_ZH-CoT', + 'charm-reason-Global_Sport_Understanding_ZH-CoT', + 'charm-reason-Global_Time_Understanding_ZH-CoT', +
'charm-reason-Chinese_Anachronisms_Judgment_EN-CoT', + 'charm-reason-Chinese_Movie_and_Music_Recommendation_EN-CoT', + 'charm-reason-Chinese_Natural_Language_Inference_EN-CoT', + 'charm-reason-Chinese_Reading_Comprehension_EN-CoT', + 'charm-reason-Chinese_Sequence_Understanding_EN-CoT', + 'charm-reason-Chinese_Sport_Understanding_EN-CoT', + 'charm-reason-Chinese_Time_Understanding_EN-CoT', + 'charm-reason-Global_Anachronisms_Judgment_EN-CoT', + 'charm-reason-Global_Movie_and_Music_Recommendation_EN-CoT', + 'charm-reason-Global_Natural_Language_Inference_EN-CoT', + 'charm-reason-Global_Reading_Comprehension_EN-CoT', + 'charm-reason-Global_Sequence_Understanding_EN-CoT', + 'charm-reason-Global_Sport_Understanding_EN-CoT', + 'charm-reason-Global_Time_Understanding_EN-CoT', + 'charm-reason-Chinese_Anachronisms_Judgment_XLT', + 'charm-reason-Chinese_Movie_and_Music_Recommendation_XLT', + 'charm-reason-Chinese_Natural_Language_Inference_XLT', + 'charm-reason-Chinese_Reading_Comprehension_XLT', + 'charm-reason-Chinese_Sequence_Understanding_XLT', + 'charm-reason-Chinese_Sport_Understanding_XLT', + 'charm-reason-Chinese_Time_Understanding_XLT', + 'charm-reason-Global_Anachronisms_Judgment_XLT', + 'charm-reason-Global_Movie_and_Music_Recommendation_XLT', + 'charm-reason-Global_Natural_Language_Inference_XLT', + 'charm-reason-Global_Reading_Comprehension_XLT', + 'charm-reason-Global_Sequence_Understanding_XLT', + 'charm-reason-Global_Sport_Understanding_XLT', + 'charm-reason-Global_Time_Understanding_XLT', + 'charm-reason-Chinese_Anachronisms_Judgment_Translate-EN', + 'charm-reason-Chinese_Movie_and_Music_Recommendation_Translate-EN', + 'charm-reason-Chinese_Natural_Language_Inference_Translate-EN', + 'charm-reason-Chinese_Reading_Comprehension_Translate-EN', + 'charm-reason-Chinese_Sequence_Understanding_Translate-EN', + 'charm-reason-Chinese_Sport_Understanding_Translate-EN', + 'charm-reason-Chinese_Time_Understanding_Translate-EN', + 'charm-reason-Global_Anachronisms_Judgment_Translate-EN', + 'charm-reason-Global_Movie_and_Music_Recommendation_Translate-EN', + 'charm-reason-Global_Natural_Language_Inference_Translate-EN', + 'charm-reason-Global_Reading_Comprehension_Translate-EN', + 'charm-reason-Global_Sequence_Understanding_Translate-EN', + 'charm-reason-Global_Sport_Understanding_Translate-EN', + 'charm-reason-Global_Time_Understanding_Translate-EN', + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +) diff --git a/configs/summarizers/chat_OC15_multi_faceted.py b/configs/summarizers/chat_OC15_multi_faceted.py index 72dfa4cb0..c6fc58121 100644 --- a/configs/summarizers/chat_OC15_multi_faceted.py +++ b/configs/summarizers/chat_OC15_multi_faceted.py @@ -122,10 +122,10 @@ ['IFEval', 'Inst-level-loose-accuracy'], ] - summarizer = dict( type=MultiFacetedSummarizer, dataset_abbrs_list=[ + {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, {'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs}, {'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs}, {'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs}, @@ -143,7 +143,6 @@ {'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]}, {'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]}, {'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs}, - {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs}, ], summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), ) diff --git a/configs/summarizers/groups/charm_rea.py 
b/configs/summarizers/groups/charm_rea.py deleted file mode 100644 index ade932758..000000000 --- a/configs/summarizers/groups/charm_rea.py +++ /dev/null @@ -1,30 +0,0 @@ -charm_tasks = [ - 'Anachronisms_Judgment', - 'Movie_and_Music_Recommendation', - 'Natural_Language_Inference', - 'Reading_Comprehension', - 'Sequence_Understanding', - 'Sport_Understanding', - 'Time_Understanding', -] -regions = [ - 'Chinese', - 'Global', -] -prompts = [ - 'Direct', - 'ZH-CoT', - 'EN-CoT', - 'XLT', - 'Translate-EN', -] - - -charm_rea_summary_groups = [] -for prompt in prompts: - for region in regions: - subsets = ['charm-rea-' + region + '_' + task + '_' + prompt for task in charm_tasks] - charm_rea_summary_groups.append({'name': 'charm-rea-' + region + '_' + prompt, 'subsets': subsets}) -for prompt in prompts: - subsets = ['charm-rea-' + region + '_' + prompt for region in regions] - charm_rea_summary_groups.append({'name': 'charm-rea-' + prompt, 'subsets': subsets}) diff --git a/configs/summarizers/groups/charm_reason.py b/configs/summarizers/groups/charm_reason.py new file mode 100644 index 000000000..52edc8d2b --- /dev/null +++ b/configs/summarizers/groups/charm_reason.py @@ -0,0 +1,35 @@ +charm_tasks = [ + 'Anachronisms_Judgment', + 'Movie_and_Music_Recommendation', + 'Natural_Language_Inference', + 'Reading_Comprehension', + 'Sequence_Understanding', + 'Sport_Understanding', + 'Time_Understanding', +] +regions = [ + 'Chinese', + 'Global', +] +prompts = [ + 'Direct', + 'ZH-CoT', + 'EN-CoT', + 'XLT', + 'Translate-EN', +] + + +charm_reason_summary_groups = [] +for prompt in prompts: + for region in regions: + subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks] + charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets}) + +for prompt in prompts: + subsets = ['charm-reason-' + region + '_' + prompt for region in regions] + charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets}) + +charm_reason_summary_groups.append( + {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']} +) diff --git a/configs/summarizers/groups/mathbench_2024.py b/configs/summarizers/groups/mathbench_2024.py new file mode 100644 index 000000000..05a11e35f --- /dev/null +++ b/configs/summarizers/groups/mathbench_2024.py @@ -0,0 +1,26 @@ + +mathbench_2024_wocircular_summary_groups = [ + {'name': 'college', 'subsets': ['college-single_choice_cn', 'college-single_choice_en']}, + {'name': 'high', 'subsets': ['high-single_choice_cn', 'high-single_choice_en']}, + {'name': 'middle', 'subsets': ['middle-single_choice_cn', 'middle-single_choice_en']}, + {'name': 'primary', 'subsets': ['primary-cloze_cn', 'primary-cloze_en']}, + {'name': 'cn', 'subsets': ['college-single_choice_cn', 'high-single_choice_cn', 'middle-single_choice_cn', 'primary-cloze_cn']}, + {'name': 'en', 'subsets': ['college-single_choice_en', 'high-single_choice_en', 'middle-single_choice_en', 'primary-cloze_en']}, + {'name': 'a', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic-cloze_en']}, + + {'name': 'college_knowledge', 'subsets': ['college_knowledge-single_choice_cn', 'college_knowledge-single_choice_en']}, + {'name': 'high_knowledge', 'subsets': ['high_knowledge-single_choice_cn', 'high_knowledge-single_choice_en']}, + {'name': 'middle_knowledge', 'subsets': ['middle_knowledge-single_choice_cn', 'middle_knowledge-single_choice_en']}, + {'name': 'primary_knowledge', 'subsets':
['primary_knowledge-single_choice_cn', 'primary_knowledge-single_choice_en']}, + {'name': 'knowledge-cn', 'subsets': ['college_knowledge-single_choice_cn', 'high_knowledge-single_choice_cn', 'middle_knowledge-single_choice_cn', 'primary_knowledge-single_choice_cn']}, + {'name': 'knowledge-en', 'subsets': ['college_knowledge-single_choice_en', 'high_knowledge-single_choice_en', 'middle_knowledge-single_choice_en', 'primary_knowledge-single_choice_en']}, + {'name': 't', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']}, + + {'name': 'overall', 'subsets': ['a', 't']}, +] + +for g in mathbench_2024_wocircular_summary_groups: + g['name'] = 'mathbench-wocircular-' + g['name'] + g['subsets'] = ['mathbench-wocircular-' + s for s in g['subsets']] + +mathbench_2024_summary_groups = mathbench_2024_wocircular_summary_groups diff --git a/opencompass/__init__.py b/opencompass/__init__.py index a6587aeab..13a85f774 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.2.4' +__version__ = '0.2.5' diff --git a/opencompass/datasets/charm.py b/opencompass/datasets/charm.py index 0a0d6b046..474eb139d 100644 --- a/opencompass/datasets/charm.py +++ b/opencompass/datasets/charm.py @@ -11,8 +11,8 @@ from .base import BaseDataset -@TEXT_POSTPROCESSORS.register_module('charm-rea') -def charm_rea_postprocess(text: str) -> str: +@TEXT_POSTPROCESSORS.register_module('charm-reason') +def charm_reason_postprocess(text: str) -> str: ans = text ans_line = ans.split('answer is ') if len(ans_line) != 1: @@ -27,14 +27,11 @@ def charm_rea_postprocess(text: str) -> str: @ICL_EVALUATORS.register_module() -class CharmReaEvaluator(BaseEvaluator): +class CharmReasonEvaluator(BaseEvaluator): def score(self, predictions, references): if len(predictions) != len(references): - return { - 'error': 'predictions and references have different ' - 'length' - } + return {'error': 'preds and refrs have different length'} details = [] cnt = 0 for pred, ref in zip(predictions, references): @@ -43,9 +40,7 @@ def score(self, predictions, references): cnt += 1 detail['correct'] = True details.append(detail) - score = cnt / len(predictions) * 100 - return {'score': score, 'details': details} diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py index 145794059..35ea6d30a 100644 --- a/opencompass/datasets/mbpp.py +++ b/opencompass/datasets/mbpp.py @@ -208,7 +208,8 @@ def __init__(self, metric: str = 'MBPP') -> None: assert self.metric in ['MBPP', 'MBPPPlus'] def score(self, predictions, references): - assert len(predictions) == len(references) + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} if self.metric == 'MBPP': result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0} @@ -286,6 +287,8 @@ def _process_answer(self, text): r'BEGIN\s*(.*)\s*DONE', r'```python\s*(.*)\s*```', r'```\s*(.*)\s*```', + r'```python\s*(.*)\s*$', + r'```\s*(.*)\s*$', r'(.*)\s*```.*', r"\[BEGIN\]\s*'(.*)", r'\[BEGIN\](.*)', diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index 4b35b160b..f41ff5d3d 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -18,7 +18,7 @@ from .hunyuan_api import Hunyuan # noqa: F401 from .intern_model import InternLM # noqa: F401 from .krgpt_api import KrGPT # noqa: F401 -from .lightllm_api import LightllmAPI # noqa: F401 +from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401 from .llama2 import Llama2, 
Llama2Chat # noqa: F401 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 from .lmdeploy_tis import LmdeployTisModel # noqa: F401 diff --git a/opencompass/models/huggingface_above_v4_33.py b/opencompass/models/huggingface_above_v4_33.py index 1f12d84fc..ae13ae106 100644 --- a/opencompass/models/huggingface_above_v4_33.py +++ b/opencompass/models/huggingface_above_v4_33.py @@ -84,7 +84,6 @@ def _convert_chat_messages(inputs, merge_role=True): messages = merged_messages outputs.append(messages) - print(messages) return outputs diff --git a/opencompass/models/lightllm_api.py b/opencompass/models/lightllm_api.py index b0062525f..2ac047324 100644 --- a/opencompass/models/lightllm_api.py +++ b/opencompass/models/lightllm_api.py @@ -1,16 +1,20 @@ import json import re +import time from concurrent.futures import ThreadPoolExecutor -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import numpy as np import requests from opencompass.registry import MODELS from opencompass.utils.logging import get_logger +from opencompass.utils.prompt import PromptList from .base import BaseModel -from .base_api import TokenBucket +from .base_api import BaseAPIModel, TokenBucket + +PromptType = Union[PromptList, str] @MODELS.register_module() @@ -189,3 +193,158 @@ def get_token_len(self, prompt: str) -> int: chinese_count = sum(len(part) for part in chinese_parts) return english_count + chinese_count + + +class LightllmChatAPI(BaseAPIModel): + """Model wrapper around a Lightllm chat-style API service. + + Args: + path (str): The name of the requested model. + url (str): The URL of the deployed API service. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 2. + max_seq_len (int): Unused here. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retries if the API call fails. Defaults to 2. + """ + + def __init__( + self, + path: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + self.url = url + self.model = path + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + input (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string.
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + data = {'messages': messages} + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + try: + response = raw_response.json() + except Exception as err: + print('Response Error:{}'.format(err)) + response = None + self.release() + + if response is None: + print('Connection error, reconnect.') + # if a connection error occurs, frequent requests will cause + # continuous network instability, so wait here + # to slow down the requests + self.wait() + continue + + if raw_response.status_code == 200: + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + + if raw_response.status_code == 401: + print('Request rejected: invalid api_key') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('Request failed, status code:', raw_response) + msg = 'The request was rejected because of high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('Request failed, status code:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('Request failed, status code:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index d6d7e0706..5497f4c2a 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -50,6 +50,9 @@ class OpenAI(BaseAPIModel): temperature (float, optional): What sampling temperature to use. If not None, will override the temperature in the `generate()` call. Defaults to None. + tokenizer_path (str, optional): The path to the tokenizer. Use path if + 'tokenizer_path' is None, otherwise use the 'tokenizer_path'. + Defaults to None.
""" is_api: bool = True @@ -67,7 +70,8 @@ def __init__(self, mode: str = 'none', logprobs: Optional[bool] = False, top_logprobs: Optional[int] = None, - temperature: Optional[float] = None): + temperature: Optional[float] = None, + tokenizer_path: Optional[str] = None): super().__init__(path=path, max_seq_len=max_seq_len, @@ -82,6 +86,7 @@ def __init__(self, self.mode = mode self.logprobs = logprobs self.top_logprobs = top_logprobs + self.tokenizer_path = tokenizer_path if isinstance(key, str): if key == 'ENV': @@ -287,7 +292,8 @@ def get_token_len(self, prompt: str) -> int: Returns: int: Length of the input tokens """ - enc = self.tiktoken.encoding_for_model(self.path) + enc = self.tiktoken.encoding_for_model(self.path + or self.tokenizer_path) return len(enc.encode(prompt)) def bin_trim(self, prompt: str, num_token: int) -> str: @@ -333,140 +339,3 @@ def bin_trim(self, prompt: str, num_token: int) -> str: elif self.mode == 'rear': prompt = sep.join(words[:l]) return prompt - - -class OpenAIAllesAPIN(OpenAI): - """Model wrapper around OpenAI-AllesAPIN. - - Args: - path (str): The name of OpenAI's model. - url (str): URL to AllesAPIN. - key (str): AllesAPIN key. - query_per_second (int): The maximum queries allowed per second - between two consecutive calls of the API. Defaults to 1. - max_seq_len (int): Unused here. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - retry (int): Number of retires if the API call fails. Defaults to 2. - """ - - is_api: bool = True - - def __init__(self, - path: str, - url: str, - key: str, - temperature: float = 1.0, - query_per_second: int = 1, - rpm_verbose: bool = False, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - retry: int = 2): - super().__init__(path=path, - max_seq_len=max_seq_len, - query_per_second=query_per_second, - rpm_verbose=rpm_verbose, - meta_template=meta_template, - retry=retry) - self.url = url - self.temperature = temperature - self.headers = { - 'alles-apin-token': key, - 'content-type': 'application/json', - } - - def _generate(self, input: PromptType, max_out_len: int, - temperature: float) -> str: - """Generate results given an input. - - Args: - inputs (PromptType): A string or PromptDict. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - temperature (float): What sampling temperature to use, - between 0 and 2. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more - focused and deterministic. - - Returns: - str: The generated string. - """ - assert isinstance(input, (str, PromptList)) - - if isinstance(input, str): - messages = [{'role': 'user', 'content': input}] - else: - messages = [] - for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' - elif item['role'] == 'SYSTEM': - msg['role'] = 'system' - messages.append(msg) - - # model can be response with user and system - # when it comes with agent involved. 
- assert msg['role'] in ['user', 'system'] - - data = { - 'model': self.path, - 'messages': messages, - 'temperature': temperature - } - for _ in range(self.retry): - self.wait() - try: - raw_response = requests.post(self.url, - headers=self.headers, - data=json.dumps(data)) - except requests.ConnectionError: - self.logger.error('Request error, got', - str(raw_response.content)) - time.sleep(1) - continue - try: - response = raw_response.json() - except requests.JSONDecodeError: - self.logger.error('JsonDecode error, got', - str(raw_response.content)) - time.sleep(1) - continue - if raw_response.status_code == 200 and response[ - 'msgCode'] == '10000': - data = response['data'] - choices = data['choices'] - if choices is None: - self.logger.error(data) - else: - return choices[0]['message']['content'].strip() - try: - match = re.match(r'Error code: \d+ - (.*)', response['data']) - err = eval(match.group(1))['error'] - if err['code'] == 'content_filter' and err['status'] == 400: - return err['message'] - except Exception: - pass - self.logger.error(response['msg']) - self.logger.error(response) - time.sleep(1) - - raise RuntimeError('API call failed.') - - def get_token_len(self, prompt: str) -> int: - """Get lengths of the tokenized string. Only English and Chinese - characters are counted for now. Users are encouraged to override this - method if more accurate length is needed. - - Args: - prompt (str): Input string. - - Returns: - int: Length of the input tokens - """ - enc = self.tiktoken.encoding_for_model(self.path) - return len(enc.encode(prompt)) diff --git a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py index 1775ba122..b08a6fabb 100644 --- a/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_base_inferencer.py @@ -145,6 +145,8 @@ def save_predictions(self, predictions): def save_prompt_and_ppl(self, label, input, prompt, ppl, idx): if str(idx) not in self.results_dict.keys(): self.results_dict[str(idx)] = {} + if 'origin_prompt' not in self.results_dict[str(idx)]: + self.results_dict[str(idx)]['origin_prompt'] = input if 'label: ' + str(label) not in self.results_dict[str(idx)].keys(): self.results_dict[str(idx)]['label: ' + str(label)] = {} self.results_dict[str(idx)]['label: ' + diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 0590d223f..e568a965b 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -161,6 +161,9 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): shell_cmd += 'umask 0000; ' shell_cmd += '{task_cmd}' + # set priority to 1 as default + task_priority = self.aliyun_cfg.get('priority', 1) + tmpl = ( 'dlc submit pytorchjob' f" --command '{shell_cmd}'" @@ -168,6 +171,7 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): f" --config {self.aliyun_cfg['dlc_config_path']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}" f" --resource_id {self.aliyun_cfg['resource_id']}" + f' --priority {task_priority}' ' --workers 1' f' --worker_cpu {max(num_gpus * 8, 12)}' f' --worker_gpu {num_gpus}' diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index f16b208db..8eeca3288 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -226,7 +226,7 @@ def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics, return raw_results, parsed_results, dataset_metrics, dataset_eval_mode 
- def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None): + def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None, skip_all_slash=False): dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs] prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs} @@ -257,14 +257,16 @@ def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, requ table.append(header) for dataset_abbr, metric in summarizer_dataset_abbrs: if dataset_abbr not in dataset_metrics: - table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + if not skip_all_slash: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) continue if metric is None: metric = dataset_metrics[dataset_abbr][0] elif metric in dataset_metrics[dataset_abbr]: pass else: - table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) + if not skip_all_slash: + table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs)) continue row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')] diff --git a/opencompass/summarizers/multi_faceted.py b/opencompass/summarizers/multi_faceted.py index 27848ba31..a272947ef 100644 --- a/opencompass/summarizers/multi_faceted.py +++ b/opencompass/summarizers/multi_faceted.py @@ -1,11 +1,9 @@ # flake8: noqa # yapf: disable -import functools -import getpass -import math +import json import os from datetime import datetime -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional import tabulate from mmengine import ConfigDict @@ -33,7 +31,9 @@ def summarize(self, output_path: str = None, time_str: str = datetime.now().strf profile_dataset_abbrs = dataset_abbrs_item['dataset_abbrs'] # format table - table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs) + table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs, skip_all_slash=True) + if len(table) == 1: + continue # output to screen print(tabulate.tabulate(table, headers='firstrow', floatfmt='.2f')) diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index a11bb4cd1..8d2494d8d 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -214,6 +214,8 @@ def postprocess(sample): preds['references'] = (test_set[self.output_column] if self.output_column else None) preds['test_set'] = test_set + if 'origin_prompt' not in preds: + preds['origin_prompt'] = [None for _ in range(len(pred_strs))] preds = { k: preds[k] for k in signature(icl_evaluator.score).parameters diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 6d65e4546..9d384297f 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -258,7 +258,7 @@ def change_accelerator(models, accelerator): path=model['path'], model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']), max_out_len=model['max_out_len'], - batch_size=32768, + batch_size=16, run_cfg=model['run_cfg'], stop_words=model.get('stop_words', []), ) @@ -272,7 +272,7 @@ def change_accelerator(models, accelerator): gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), max_seq_len=model.get('max_seq_len', 2048), max_out_len=model['max_out_len'], - batch_size=32768, + batch_size=16, 
run_cfg=model['run_cfg'], stop_words=model.get('stop_words', []), ) diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index b1b0c1730..7baddbb4c 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -63,15 +63,15 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str: # yapf: disable # flake8: noqa: W605 patterns = [ - f'答案是?\s?([{options}])', - f'答案是?\s?:([{options}])', - f'答案是?\s?:([{options}])', - f'答案应该?是\s?([{options}])', - f'答案应该?选\s?([{options}])', - f'答案为\s?([{options}])', - f'答案选\s?([{options}])', - f'选择?\s?([{options}])', - f'故选?\s?([{options}])' + f'答案是?\s*([{options}])', + f'答案是?\s*:\s*([{options}])', + f'答案是?\s*:\s*([{options}])', + f'答案应该?是\s*([{options}])', + f'答案应该?选\s*([{options}])', + f'答案为\s*([{options}])', + f'答案选\s*([{options}])', + f'选择?\s*([{options}])', + f'故选?\s*([{options}])' f'只有选?项?\s?([{options}])\s?是?对', f'只有选?项?\s?([{options}])\s?是?错', f'只有选?项?\s?([{options}])\s?不?正确', diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 8452052e3..b1c10ae81 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -22,6 +22,7 @@ OpenCC opencv-python-headless pandas<2.0.0 prettytable +protobuf pyext pypinyin python-Levenshtein From 808582d9522af49eee91af254ceb19252dc94848 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Wed, 29 May 2024 10:14:08 +0800 Subject: [PATCH 03/12] Fix VLLM argument error (#1207) --- configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py | 2 +- configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py | 2 +- configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py | 2 +- configs/models/others/vllm_orionstar_14b_longchat.py | 2 +- configs/models/qwen/vllm_qwen1_5_14b_chat.py | 2 +- configs/models/qwen/vllm_qwen1_5_72b_chat.py | 2 +- configs/models/qwen/vllm_qwen_14b_chat.py | 2 +- configs/models/qwen/vllm_qwen_72b_chat.py | 2 +- configs/models/vicuna/vllm_vicuna_13b_v15_16k.py | 2 +- configs/models/vicuna/vllm_vicuna_7b_v15_16k.py | 2 +- configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py | 2 +- configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py | 2 +- configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py | 2 +- configs/models/zephyr/vllm_zephyr_7b_beta.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py index 5f4eee671..db4094f62 100644 --- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py +++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_1.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py index ca202d1b9..c1dd32ba2 100644 --- a/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py +++ b/configs/models/mistral/vllm_mistral_7b_instruct_v0_2.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py index 03975355d..2bbc0f0d6 100644 --- a/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py +++ b/configs/models/mistral/vllm_mixtral_8x7b_instruct_v0_1.py @@ -20,7 +20,7 @@ 
max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=2, num_procs=1), ) ] diff --git a/configs/models/others/vllm_orionstar_14b_longchat.py b/configs/models/others/vllm_orionstar_14b_longchat.py index 58cdea8d3..2c90f7a3d 100644 --- a/configs/models/others/vllm_orionstar_14b_longchat.py +++ b/configs/models/others/vllm_orionstar_14b_longchat.py @@ -21,6 +21,6 @@ max_seq_len=4096, batch_size=32, run_cfg=dict(num_gpus=4, num_procs=1), - end_str='<|endoftext|>', + stop_words=['<|endoftext|>'], ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_14b_chat.py b/configs/models/qwen/vllm_qwen1_5_14b_chat.py index e4736857b..b5e97d3d9 100644 --- a/configs/models/qwen/vllm_qwen1_5_14b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_14b_chat.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='<|im_end|>', + stop_words=['<|im_end|>'], run_cfg=dict(num_gpus=2, num_procs=1), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_72b_chat.py b/configs/models/qwen/vllm_qwen1_5_72b_chat.py index 921e220a5..2cb2ca534 100644 --- a/configs/models/qwen/vllm_qwen1_5_72b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_72b_chat.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='<|im_end|>', + stop_words=['<|im_end|>'], run_cfg=dict(num_gpus=4, num_procs=1), ) ] diff --git a/configs/models/qwen/vllm_qwen_14b_chat.py b/configs/models/qwen/vllm_qwen_14b_chat.py index 830010ef4..53e221aa2 100644 --- a/configs/models/qwen/vllm_qwen_14b_chat.py +++ b/configs/models/qwen/vllm_qwen_14b_chat.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='<|im_end|>', + stop_words=['<|im_end|>'], run_cfg=dict(num_gpus=4, num_procs=1), ) ] diff --git a/configs/models/qwen/vllm_qwen_72b_chat.py b/configs/models/qwen/vllm_qwen_72b_chat.py index cf2caa2c8..fb7279139 100644 --- a/configs/models/qwen/vllm_qwen_72b_chat.py +++ b/configs/models/qwen/vllm_qwen_72b_chat.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='<|im_end|>', + stop_words=['<|im_end|>'], run_cfg=dict(num_gpus=4, num_procs=1), ) ] diff --git a/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py b/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py index b15b49215..e26b3ff18 100644 --- a/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py +++ b/configs/models/vicuna/vllm_vicuna_13b_v15_16k.py @@ -17,7 +17,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=2, num_procs=1), ) ] diff --git a/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py b/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py index b45646f0b..ca261f132 100644 --- a/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py +++ b/configs/models/vicuna/vllm_vicuna_7b_v15_16k.py @@ -17,7 +17,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py b/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py index 5df42752f..2e7fafeb6 100644 --- a/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py +++ b/configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py @@ -18,7 +18,7 @@ max_seq_len=2048, batch_size=1, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git 
a/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py b/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py index 8c25d0101..a722593a6 100644 --- a/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py +++ b/configs/models/wizardlm/vllm_wizardlm_70b_v1_0.py @@ -19,7 +19,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=4, num_procs=1), ) ] diff --git a/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py b/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py index b7fe7f02c..60b33c865 100644 --- a/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py +++ b/configs/models/wizardlm/vllm_wizardlm_7b_v1_0.py @@ -18,7 +18,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/configs/models/zephyr/vllm_zephyr_7b_beta.py b/configs/models/zephyr/vllm_zephyr_7b_beta.py index 956ac073e..499b58b6d 100644 --- a/configs/models/zephyr/vllm_zephyr_7b_beta.py +++ b/configs/models/zephyr/vllm_zephyr_7b_beta.py @@ -17,7 +17,7 @@ max_seq_len=2048, batch_size=32, generation_kwargs=dict(temperature=0), - end_str='', + stop_words=[''], run_cfg=dict(num_gpus=1, num_procs=1), ) ] From d656e818f8f0bdc4ceff91567bde13dd882e02d8 Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Wed, 29 May 2024 16:30:10 +0800 Subject: [PATCH 04/12] [Docs] Remove --no-batch-padding and Use --hf-num-gpus (#1205) * [Docs] Remove --no-batch-padding and Use -hf-num-gpus * update --- docs/en/get_started/faq.md | 17 ++++------------- docs/en/get_started/quick_start.md | 4 ++-- docs/en/user_guides/experimentation.md | 13 ++----------- docs/zh_cn/get_started/faq.md | 17 ++++------------- docs/zh_cn/get_started/quick_start.md | 4 ++-- docs/zh_cn/user_guides/experimentation.md | 13 ++----------- opencompass/cli/main.py | 9 ++++++++- opencompass/utils/run.py | 2 +- 8 files changed, 25 insertions(+), 54 deletions(-) diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md index edaa795a3..570605c72 100644 --- a/docs/en/get_started/faq.md +++ b/docs/en/get_started/faq.md @@ -37,9 +37,9 @@ This is a complex issue that needs to be explained from both the supply and dema The supply side refers to how many tasks are being run. A task is a combination of a model and a dataset, and it primarily depends on how many models and datasets need to be tested. Additionally, since OpenCompass splits a larger task into multiple smaller tasks, the number of data entries per sub-task (`--max-partition-size`) also affects the number of tasks. (The `--max-partition-size` is proportional to the actual number of data entries, but the relationship is not 1:1). -The demand side refers to how many workers are running. Since OpenCompass instantiates multiple models for inference simultaneously, we use `--num-gpus` to specify how many GPUs each instance uses. Note that `--num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs. +The demand side refers to how many workers are running. 
Since OpenCompass instantiates multiple models for inference simultaneously, we use `--hf-num-gpus` to specify how many GPUs each instance uses. Note that `--hf-num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs. -In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--num-gpus` to the minimum value that meets the demand and do not adjust it further.** +In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--hf-num-gpus` to the minimum value that meets the demand and do not adjust it further.** ### How do I control the number of GPUs that OpenCompass occupies? @@ -114,17 +114,8 @@ Hence, if users find that the number of tasks greatly exceeds the available GPUs ### How to use the downloaded huggingface models? -If you have already download the checkpoints of the model, you can specify the local path of the model and tokenizer, and add `trust_remote_code=True` for `--model-kwargs` and `--tokenizer-kwargs`. For example +If you have already download the checkpoints of the model, you can specify the local path of the model. For example ```bash -python run.py --datasets siqa_gen winograd_ppl \ ---hf-path /path/to/model \ # HuggingFace 模型地址 ---tokenizer-path /path/to/model \ # HuggingFace 模型地址 ---model-kwargs device_map='auto' trust_remote_code=True \ # 构造 model 的参数 ---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False trust_remote_code=True \ # 构造 tokenizer 的参数 ---max-out-len 100 \ # 模型能接受的最大序列长度 ---max-seq-len 2048 \ # 最长生成 token 数 ---batch-size 8 \ # 批次大小 ---no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失 ---num-gpus 1 # 所需 gpu 数 +python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model ``` diff --git a/docs/en/get_started/quick_start.md b/docs/en/get_started/quick_start.md index caba742c9..9e89e4df7 100644 --- a/docs/en/get_started/quick_start.md +++ b/docs/en/get_started/quick_start.md @@ -87,7 +87,7 @@ python run.py --datasets siqa_gen winograd_ppl \ Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once. ```{caution} -`--num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. [More](faq.md#how-does-opencompass-allocate-gpus) +`--hf-num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. 
[More](faq.md#how-does-opencompass-allocate-gpus) ``` :::{dropdown} More detailed example @@ -103,7 +103,7 @@ python run.py --datasets siqa_gen winograd_ppl \ --max-out-len 100 \ # Maximum number of tokens to generate --min-out-len 100 \ # Minimum number of tokens to generate --batch-size 64 \ # Batch size ---num-gpus 1 # Number of GPUs required to run the model +--hf-num-gpus 1 # Number of GPUs required to run the model ``` ```{seealso} For all HuggingFace related parameters supported by `run.py`, please read [Launching Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task). diff --git a/docs/en/user_guides/experimentation.md b/docs/en/user_guides/experimentation.md index 96b3cca30..0f5575a0f 100644 --- a/docs/en/user_guides/experimentation.md +++ b/docs/en/user_guides/experimentation.md @@ -25,15 +25,7 @@ Task Configuration (`$EXP`): - For HuggingFace related models, users can also define a model quickly in the command line through HuggingFace parameters and then specify datasets using `--datasets DATASET1 DATASET2 ...`. ```bash - python run.py --datasets siqa_gen winograd_ppl \ - --hf-path huggyllama/llama-7b \ # HuggingFace model path - --model-kwargs device_map='auto' \ # Parameters for constructing the model - --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Parameters for constructing the tokenizer - --max-out-len 100 \ # Maximum sequence length the model can accept - --max-seq-len 2048 \ # Maximum generated token count - --batch-size 8 \ # Batch size - --no-batch-padding \ # Disable batch padding and infer through a for loop to avoid accuracy loss - --num-gpus 1 # Number of minimum required GPUs for this model + python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path huggyllama/llama-7b ``` Complete HuggingFace parameter descriptions: @@ -45,9 +37,8 @@ Task Configuration (`$EXP`): - `--tokenizer-kwargs`: Parameters for constructing the tokenizer - `--max-out-len`: Maximum generated token count - `--max-seq-len`: Maximum sequence length the model can accept - - `--no-batch-padding`: Disable batch padding and infer through a for loop to avoid accuracy loss - `--batch-size`: Batch size - - `--num-gpus`: Number of GPUs required to run the model. Please note that this parameter is only used to determine the number of GPUs required to run the model, and does not affect the actual number of GPUs used for the task. Refer to [Efficient Evaluation](./evaluation.md) for more details. + - `--hf-num-gpus`: Number of GPUs required to run the model. Please note that this parameter is only used to determine the number of GPUs required to run the model, and does not affect the actual number of GPUs used for the task. Refer to [Efficient Evaluation](./evaluation.md) for more details. 
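As a concrete illustration of the allocation formula described in the FAQ above, the following launch is hypothetical (the flag values are chosen only to make the arithmetic visible, they are not defaults): with `--hf-num-gpus 2`, `--max-num-workers 4` and `--max-num-workers-per-gpu 1`, OpenCompass would occupy at most 2 * 4 / 1 = 8 GPUs.

```bash
# Hypothetical example combining the HuggingFace CLI flags with the worker controls:
# each instance of the model needs 2 GPUs, at most 4 instances run concurrently,
# and each GPU hosts a single instance, so at most 2 * 4 / 1 = 8 GPUs are used.
python run.py --datasets siqa_gen winograd_ppl \
    --hf-type base \
    --hf-path huggyllama/llama-7b \
    --hf-num-gpus 2 \
    --max-num-workers 4 \
    --max-num-workers-per-gpu 1 \
    --max-partition-size 4000   # smaller partitions split the run into more sub-tasks
```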
Starting Methods: diff --git a/docs/zh_cn/get_started/faq.md b/docs/zh_cn/get_started/faq.md index 5cc6cfbb3..44e0f9ea5 100644 --- a/docs/zh_cn/get_started/faq.md +++ b/docs/zh_cn/get_started/faq.md @@ -37,9 +37,9 @@ OpenCompass 使用称为 task (任务) 的单位处理评估请求。每个任 供给侧就是运行多少任务。任务是模型和数据集的组合,它首先取决于要测多少模型和多少数据集。另外由于 OpenCompass 会将一个较大的任务拆分成多个小任务,因此每个子任务有多少条数据 `--max-partition-size` 也会影响任务的数量。(`--max-partition-size` 与真实数据条目成正比,但并不是 1:1 的关系)。 -需求侧就是有多少 worker 在运行。由于 OpenCompass 会同时实例化多个模型去进行推理,因此我们用 `--num-gpus` 来指定每个实例使用多少 GPU。注意 `--num-gpus` 是一个 HuggingFace 模型专用的参数,非 HuggingFace 模型设置该参数是不会起作用的。同时我们使用 `--max-num-workers` 去表示最多有多少个实例在运行。最后由于 GPU 显存、负载不充分等问题,OpenCompass 也支持在同一个 GPU 上运行多个实例,这个参数是 `--max-num-workers-per-gpu`。因此可以笼统地认为,我们总共会使用 `--num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` 个 GPU。 +需求侧就是有多少 worker 在运行。由于 OpenCompass 会同时实例化多个模型去进行推理,因此我们用 `--hf-num-gpus` 来指定每个实例使用多少 GPU。注意 `--hf-num-gpus` 是一个 HuggingFace 模型专用的参数,非 HuggingFace 模型设置该参数是不会起作用的。同时我们使用 `--max-num-workers` 去表示最多有多少个实例在运行。最后由于 GPU 显存、负载不充分等问题,OpenCompass 也支持在同一个 GPU 上运行多个实例,这个参数是 `--max-num-workers-per-gpu`。因此可以笼统地认为,我们总共会使用 `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` 个 GPU。 -综上,当任务运行较慢,GPU 负载不高的时候,我们首先需要检查供给是否充足,如果不充足,可以考虑调小 `--max-partition-size` 来将任务拆分地更细;其次需要检查需求是否充足,如果不充足,可以考虑增大 `--max-num-workers` 和 `--max-num-workers-per-gpu`。一般来说,**我们会将 `--num-gpus` 设定为最小的满足需求的值,并不会再进行调整**。 +综上,当任务运行较慢,GPU 负载不高的时候,我们首先需要检查供给是否充足,如果不充足,可以考虑调小 `--max-partition-size` 来将任务拆分地更细;其次需要检查需求是否充足,如果不充足,可以考虑增大 `--max-num-workers` 和 `--max-num-workers-per-gpu`。一般来说,**我们会将 `--hf-num-gpus` 设定为最小的满足需求的值,并不会再进行调整**。 ### 我如何控制 OpenCompass 占用的 GPU 数量? @@ -114,17 +114,8 @@ OpenCompass 中的每个任务代表等待评估的特定模型和数据集部 ### 如何使用本地已下好的 Huggingface 模型? -如果您已经提前下载好 Huggingface 的模型文件,请手动指定模型路径,并在`--model-kwargs` 和 `--tokenizer-kwargs`中添加 `trust_remote_code=True`. 示例如下 +如果您已经提前下载好 Huggingface 的模型文件,请手动指定模型路径. 
示例如下 ```bash -python run.py --datasets siqa_gen winograd_ppl \ ---hf-path /path/to/model \ # HuggingFace 模型地址 ---tokenizer-path /path/to/model \ # HuggingFace 模型地址 ---model-kwargs device_map='auto' trust_remote_code=True \ # 构造 model 的参数 ---tokenizer-kwargs padding_side='left' truncation='left' use_fast=False trust_remote_code=True \ # 构造 tokenizer 的参数 ---max-out-len 100 \ # 模型能接受的最大序列长度 ---max-seq-len 2048 \ # 最长生成 token 数 ---batch-size 8 \ # 批次大小 ---no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失 ---num-gpus 1 # 所需 gpu 数 +python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model ``` diff --git a/docs/zh_cn/get_started/quick_start.md b/docs/zh_cn/get_started/quick_start.md index 2e092b160..a624552d6 100644 --- a/docs/zh_cn/get_started/quick_start.md +++ b/docs/zh_cn/get_started/quick_start.md @@ -86,7 +86,7 @@ python run.py --datasets siqa_gen winograd_ppl \ 请注意,通过这种方式,OpenCompass 一次只评估一个模型,而其他方式可以一次评估多个模型。 ```{caution} -`--num-gpus` 不代表实际用于评估的 GPU 数量,而是该模型所需的最少 GPU 数量。[更多](faq.md#opencompass-如何分配-gpu) +`--hf-num-gpus` 不代表实际用于评估的 GPU 数量,而是该模型所需的最少 GPU 数量。[更多](faq.md#opencompass-如何分配-gpu) ``` @@ -104,7 +104,7 @@ python run.py --datasets siqa_gen winograd_ppl \ --max-out-len 100 \ # 生成的最大 token 数 --min-out-len 100 \ # 生成的最小 token 数 --batch-size 64 \ # 批量大小 ---num-gpus 1 # 运行模型所需的 GPU 数量 +--hf-num-gpus 1 # 运行模型所需的 GPU 数量 ``` ```{seealso} 有关 `run.py` 支持的所有与 HuggingFace 相关的参数,请阅读 [评测任务发起](../user_guides/experimentation.md#评测任务发起) diff --git a/docs/zh_cn/user_guides/experimentation.md b/docs/zh_cn/user_guides/experimentation.md index 5b781f27b..f7b234f75 100644 --- a/docs/zh_cn/user_guides/experimentation.md +++ b/docs/zh_cn/user_guides/experimentation.md @@ -25,15 +25,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb - 对于 HuggingFace 相关模型,用户也可以通过 HuggingFace 参数快速在命令行中定义一个模型,再通过 `--datasets DATASET1 DATASET2 ...` 定义数据集。 ```bash - python run.py --datasets siqa_gen winograd_ppl \ - --hf-path huggyllama/llama-7b \ # HuggingFace 模型地址 - --model-kwargs device_map='auto' \ # 构造 model 的参数 - --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # 构造 tokenizer 的参数 - --max-out-len 100 \ # 模型能接受的最大序列长度 - --max-seq-len 2048 \ # 最长生成 token 数 - --batch-size 8 \ # 批次大小 - --no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失 - --num-gpus 1 # 所需 gpu 数 + python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path huggyllama/llama-7b ``` HuggingFace 全量参数介绍如下: @@ -45,9 +37,8 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb - `--tokenizer-kwargs`: 构造 tokenizer 的参数 - `--max-out-len`: 最长生成 token 数 - `--max-seq-len`: 模型能接受的最大序列长度 - - `--no-batch-padding`: 不打开 batch padding,通过 for loop 推理,避免精度损失 - `--batch-size`: 批次大小 - - `--num-gpus`: 运行模型所需的gpu数 + - `--hf-num-gpus`: 运行模型所需的gpu数 启动方式: diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 7defe785d..f0aaa5f43 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -186,7 +186,8 @@ def parse_hf_args(hf_parser): hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model') hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model') hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model') - hf_parser.add_argument('--num-gpus', type=int, default=1, help='The number of GPUs for **the HuggingFace model passed via cli**') + 
hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead') + hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli') hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model') hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model') @@ -205,6 +206,12 @@ def parse_custom_dataset_args(custom_dataset_parser): def main(): args = parse_args() + + if args.num_gpus is not None: + raise ValueError('The `--num-gpus` argument is deprecated, please use ' + '`--hf-num-gpus` to describe number of gpus used for ' + 'the HuggingFace model instead.') + if args.dry_run: args.debug = True # initialize logger diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 9d384297f..b584795f0 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -151,7 +151,7 @@ def get_config_from_arg(args) -> Config: batch_size=args.batch_size, pad_token_id=args.pad_token_id, stop_words=args.stop_words, - run_cfg=dict(num_gpus=args.num_gpus)) + run_cfg=dict(num_gpus=args.hf_num_gpus)) logger.debug(f'Using model: {model}') models.append(model) # set infer accelerator if needed From 0b50112dc19b5b06e32e41b6d7a3ba597ea516e8 Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Thu, 30 May 2024 00:03:22 +0800 Subject: [PATCH 05/12] [Fix] Rollback opt model configs (#1213) --- configs/models/opt/hf_opt_125m.py | 20 ++++++++++---------- configs/models/opt/hf_opt_350m.py | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/configs/models/opt/hf_opt_125m.py b/configs/models/opt/hf_opt_125m.py index ec0c68d36..b813d28e6 100644 --- a/configs/models/opt/hf_opt_125m.py +++ b/configs/models/opt/hf_opt_125m.py @@ -1,12 +1,12 @@ from opencompass.models import HuggingFaceBaseModel -models = [ - dict( - type=HuggingFaceBaseModel, - abbr='opt-125m-hf', - path='facebook/opt-125m', - max_out_len=1024, - batch_size=64, - run_cfg=dict(num_gpus=1), - ) -] +opt125m = dict( + type=HuggingFaceBaseModel, + abbr='opt-125m-hf', + path='facebook/opt-125m', + max_out_len=1024, + batch_size=64, + run_cfg=dict(num_gpus=1), +) + +models = [opt125m] diff --git a/configs/models/opt/hf_opt_350m.py b/configs/models/opt/hf_opt_350m.py index 6a25db0a7..57aa609cb 100644 --- a/configs/models/opt/hf_opt_350m.py +++ b/configs/models/opt/hf_opt_350m.py @@ -1,12 +1,12 @@ from opencompass.models import HuggingFaceBaseModel -models = [ - dict( - type=HuggingFaceBaseModel, - abbr='opt-350m-hf', - path='facebook/opt-350m', - max_out_len=1024, - batch_size=32, - run_cfg=dict(num_gpus=1), - ) -] +opt350m = dict( + type=HuggingFaceBaseModel, + abbr='opt-350m-hf', + path='facebook/opt-350m', + max_out_len=1024, + batch_size=32, + run_cfg=dict(num_gpus=1), +) + +models = [opt350m] From d59189b87fe66a9dace16930f193742969e9fb84 Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Thu, 30 May 2024 00:06:39 +0800 Subject: [PATCH 06/12] [Doc] Update running command in README (#1206) --- configs/datasets/GaokaoBench/README.md | 5 ++ configs/datasets/IFEval/README.md | 31 +++++++ configs/datasets/TheoremQA/README.md | 5 ++ configs/datasets/bbh/README.md | 5 ++ configs/datasets/ceval/README.md | 5 ++ configs/datasets/gpqa/README.md | 5 ++ configs/datasets/gsm8k/README.md | 5 ++ configs/datasets/hellaswag/README.md | 5 ++ configs/datasets/humaneval/README.md | 5 ++ configs/datasets/math/README.md | 5 ++ 
configs/datasets/mbpp/README.md | 119 +++++++++++++------------ configs/datasets/mmlu/README.md | 5 ++ configs/datasets/nq/README.md | 5 ++ configs/datasets/race/README.md | 5 ++ configs/datasets/triviaqa/README.md | 5 ++ configs/datasets/winogrande/README.md | 5 ++ 16 files changed, 163 insertions(+), 57 deletions(-) create mode 100644 configs/datasets/IFEval/README.md diff --git a/configs/datasets/GaokaoBench/README.md b/configs/datasets/GaokaoBench/README.md index d05c091cc..71b0ffac4 100644 --- a/configs/datasets/GaokaoBench/README.md +++ b/configs/datasets/GaokaoBench/README.md @@ -1,5 +1,10 @@ # GaokaoBench +```bash +python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug +``` + ## Base Models | model | GaokaoBench | diff --git a/configs/datasets/IFEval/README.md b/configs/datasets/IFEval/README.md new file mode 100644 index 000000000..3753262c3 --- /dev/null +++ b/configs/datasets/IFEval/README.md @@ -0,0 +1,31 @@ +# IFEval + +```bash +python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug +``` + +## Chat Models + +| model | Prompt-level-strict-accuracy | Inst-level-strict-accuracy | Prompt-level-loose-accuracy | Inst-level-loose-accuracy | +|:-----------------------------:|-------------------------------:|-----------------------------:|------------------------------:|----------------------------:| +| qwen1.5-0.5b-chat-hf | 13.12 | 23.26 | 15.71 | 26.38 | +| qwen1.5-1.8b-chat-hf | 16.08 | 26.26 | 18.30 | 29.02 | +| qwen1.5-4b-chat-hf | 25.51 | 35.97 | 28.84 | 39.81 | +| qwen1.5-7b-chat-hf | 38.82 | 50.00 | 42.70 | 53.48 | +| qwen1.5-14b-chat-hf | 42.51 | 54.20 | 49.17 | 59.95 | +| qwen1.5-32b-chat-hf | 49.54 | 60.43 | 53.97 | 64.39 | +| qwen1.5-72b-chat-hf | 51.02 | 61.99 | 57.12 | 67.27 | +| qwen1.5-110b-chat-hf | 55.08 | 65.59 | 61.18 | 70.86 | +| internlm2-chat-1.8b-hf | 18.30 | 28.78 | 21.44 | 32.01 | +| internlm2-chat-1.8b-sft-hf | 18.67 | 31.18 | 19.78 | 32.85 | +| internlm2-chat-7b-hf | 34.75 | 46.28 | 40.48 | 51.44 | +| internlm2-chat-7b-sft-hf | 39.19 | 50.12 | 42.33 | 52.76 | +| internlm2-chat-20b-hf | 36.41 | 48.68 | 40.67 | 53.24 | +| internlm2-chat-20b-sft-hf | 44.55 | 55.64 | 46.77 | 58.03 | +| llama-3-8b-instruct-hf | 68.02 | 76.74 | 75.42 | 82.85 | +| llama-3-70b-instruct-hf | 78.00 | 84.65 | 84.29 | 89.21 | +| llama-3-8b-instruct-lmdeploy | 69.13 | 77.46 | 77.26 | 83.93 | +| llama-3-70b-instruct-lmdeploy | 75.97 | 82.97 | 83.18 | 88.37 | +| mistral-7b-instruct-v0.1-hf | 40.30 | 50.96 | 41.96 | 53.48 | +| mistral-7b-instruct-v0.2-hf | 49.17 | 60.43 | 51.94 | 64.03 | +| mixtral-8x7b-instruct-v0.1-hf | 50.09 | 60.67 | 55.64 | 65.83 | diff --git a/configs/datasets/TheoremQA/README.md b/configs/datasets/TheoremQA/README.md index d62dddada..ab5e57f11 100644 --- a/configs/datasets/TheoremQA/README.md +++ b/configs/datasets/TheoremQA/README.md @@ -1,5 +1,10 @@ # TheoremQA +```bash +python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +``` + ## Base Models | model | TheoremQA | diff --git a/configs/datasets/bbh/README.md b/configs/datasets/bbh/README.md index e0e35850d..34aee487a 100644 --- a/configs/datasets/bbh/README.md +++ b/configs/datasets/bbh/README.md @@ -1,5 +1,10 @@ # BBH +```bash +python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug +python3 run.py 
--models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug +``` + ## Base Models | model | bbh | diff --git a/configs/datasets/ceval/README.md b/configs/datasets/ceval/README.md index 4b49f787f..6932b73bb 100644 --- a/configs/datasets/ceval/README.md +++ b/configs/datasets/ceval/README.md @@ -1,5 +1,10 @@ # C-Eval +```bash +python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug +python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug +``` + ## Base Models | model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other | diff --git a/configs/datasets/gpqa/README.md b/configs/datasets/gpqa/README.md index 05f4aeb9b..02560df51 100644 --- a/configs/datasets/gpqa/README.md +++ b/configs/datasets/gpqa/README.md @@ -1,5 +1,10 @@ # GPQA +```bash +python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug +``` + ## Base Models | model | GPQA_diamond | diff --git a/configs/datasets/gsm8k/README.md b/configs/datasets/gsm8k/README.md index c91910de6..e4b9f9f68 100644 --- a/configs/datasets/gsm8k/README.md +++ b/configs/datasets/gsm8k/README.md @@ -1,5 +1,10 @@ # GSM8K +```bash +python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug +``` + ## Base Models | model | gsm8k | diff --git a/configs/datasets/hellaswag/README.md b/configs/datasets/hellaswag/README.md index dd62abc7c..91a5e2261 100644 --- a/configs/datasets/hellaswag/README.md +++ b/configs/datasets/hellaswag/README.md @@ -1,5 +1,10 @@ # HellaSwag +```bash +python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug +``` + ## Base Models | model | hellaswag | diff --git a/configs/datasets/humaneval/README.md b/configs/datasets/humaneval/README.md index 4637e8a0a..c0e1e157e 100644 --- a/configs/datasets/humaneval/README.md +++ b/configs/datasets/humaneval/README.md @@ -1,5 +1,10 @@ # HumanEval +```bash +python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug +``` + ## Base Models | model | pass@1 | diff --git a/configs/datasets/math/README.md b/configs/datasets/math/README.md index a11a39083..c498db344 100644 --- a/configs/datasets/math/README.md +++ b/configs/datasets/math/README.md @@ -1,5 +1,10 @@ # MATH +```bash +python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug +python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug +``` + ## Base Models | model | math | diff --git a/configs/datasets/mbpp/README.md b/configs/datasets/mbpp/README.md index 3d0d84830..2f2b61115 100644 --- a/configs/datasets/mbpp/README.md +++ b/configs/datasets/mbpp/README.md @@ -1,64 +1,69 @@ # MBPP +```bash +python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug +python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug +``` + ## Base Models -| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | 
-|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| -| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | -| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | -| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | -| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | -| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | -| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | -| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | -| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | -| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | -| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | -| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | -| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | -| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | -| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | -| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | -| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | -| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | -| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | -| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | -| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | -| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | -| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 | -| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | -| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | -| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | -| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | -| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | -| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | -| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | -| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | -| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 | -| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:------------------------:|---------:|-------:|----------:|---------:|---------------:| +| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | +| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | +| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | +| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | +| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | +| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | +| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | +| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | +| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | +| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | +| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | +| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | +| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | +| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | +| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | +| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | +| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | +| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | +| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | +| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | +| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | +| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 | +| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | +| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | +| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | +| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | +| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | +| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | +| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | +| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | +| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 
| +| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | ## Chat Models -| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | -|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| -| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | -| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | -| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | -| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | -| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | -| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | -| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | -| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | -| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | -| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | -| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | -| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | -| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | -| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | -| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | -| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | -| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | -| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | -| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | -| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | -| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:| +| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | +| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | +| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | +| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | +| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | +| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | +| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | +| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | +| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | +| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | +| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | +| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | +| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | +| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | +| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | +| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | +| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | +| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | +| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | +| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | +| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | diff --git a/configs/datasets/mmlu/README.md b/configs/datasets/mmlu/README.md index b8e4a9afd..eaa2181e3 100644 --- a/configs/datasets/mmlu/README.md +++ b/configs/datasets/mmlu/README.md @@ -1,5 +1,10 @@ # MMLU +```bash +python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug +python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug +``` + ## Base Models | model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | diff --git a/configs/datasets/nq/README.md b/configs/datasets/nq/README.md index 3c795f2ff..0cf8e714b 100644 --- a/configs/datasets/nq/README.md +++ b/configs/datasets/nq/README.md @@ -1,5 +1,10 @@ # NQ +```bash +python3 run.py --models hf_internlm2_7b 
--datasets nq_open_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug +``` + ## Base Models | model | nq | diff --git a/configs/datasets/race/README.md b/configs/datasets/race/README.md index f1c55ba76..916bdb80b 100644 --- a/configs/datasets/race/README.md +++ b/configs/datasets/race/README.md @@ -1,5 +1,10 @@ # RACE +```bash +python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug +``` + ## Base Models | model | race-high | race-middle | diff --git a/configs/datasets/triviaqa/README.md b/configs/datasets/triviaqa/README.md index f5e155a95..c849ed120 100644 --- a/configs/datasets/triviaqa/README.md +++ b/configs/datasets/triviaqa/README.md @@ -1,5 +1,10 @@ # TriviaQA +```bash +python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug +``` + ## Base Models | model | triviaqa | diff --git a/configs/datasets/winogrande/README.md b/configs/datasets/winogrande/README.md index 816732682..48a5ba70a 100644 --- a/configs/datasets/winogrande/README.md +++ b/configs/datasets/winogrande/README.md @@ -1,5 +1,10 @@ # WinoGrande +```bash +python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug +``` + ## Base Models | model | winogrande | From a77b8a5cecb1d8dd31a500dec1ad34c6dd6ded9c Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Thu, 30 May 2024 00:21:58 +0800 Subject: [PATCH 07/12] [Sync] format (#1214) --- .../compassbench/compassbench_compare.py | 58 +++++ configs/eval_subjective_compassbench.py | 137 ++++++++++ configs/summarizers/groups/charm_reason.py | 8 +- opencompass/datasets/subjective/__init__.py | 1 + .../datasets/subjective/compassbench.py | 101 ++++++++ .../summarizers/subjective/__init__.py | 1 + .../summarizers/subjective/compassbench.py | 241 ++++++++++++++++++ opencompass/utils/prompt.py | 11 +- opencompass/utils/run.py | 12 +- 9 files changed, 561 insertions(+), 9 deletions(-) create mode 100644 configs/datasets/subjective/compassbench/compassbench_compare.py create mode 100644 configs/eval_subjective_compassbench.py create mode 100644 opencompass/datasets/subjective/compassbench.py create mode 100644 opencompass/summarizers/subjective/compassbench.py diff --git a/configs/datasets/subjective/compassbench/compassbench_compare.py b/configs/datasets/subjective/compassbench/compassbench_compare.py new file mode 100644 index 000000000..942eca7c5 --- /dev/null +++ b/configs/datasets/subjective/compassbench/compassbench_compare.py @@ -0,0 +1,58 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassBenchDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'judge_prompt'], + output_column='judge', + ) + +data_path ='data/subjective/compassbench' + +subjective_datasets = [] + +versions = ['CompassbenchV1'] + +for version_abbr in versions: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + 
retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = '{judge_prompt}' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=version_abbr, + type=CompassBenchDataset, + path=data_path, + name=version_abbr, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/eval_subjective_compassbench.py b/configs/eval_subjective_compassbench.py new file mode 100644 index 000000000..ebb1c4e55 --- /dev/null +++ b/configs/eval_subjective_compassbench.py @@ -0,0 +1,137 @@ +from os import getenv as gv +from opencompass.models import HuggingFaceCausalLM +from mmengine.config import read_base + +with read_base(): + from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI +from opencompass.partitioners import NaivePartitioner, SizePartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.runners import LocalRunner +from opencompass.runners import SlurmSequentialRunner +from opencompass.tasks import OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import CompassBenchSummarizer + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +# -------------Inference Stage ---------------------------------------- + +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='internlm2-chat-7b-hf', + path='internlm/internlm2-chat-7b', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + stop_words=['', '<|im_end|>'], + generation_kwargs=dict( + do_sample=True, + ), + ) +] + +datasets = [*subjective_datasets] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=SlurmSequentialRunner, + partition='llmeval', + quotatype='reserved', + max_num_workers=256, + task=dict(type=OpenICLInferTask), + ), +) + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + +# -------------Evalation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration +judge_models = [dict( + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, +)] + +judge_models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='internlm102b', + path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf', + 
max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + stop_words=['', '<|im_end|>'], + ), + dict( + type=HuggingFacewithChatTemplate, + abbr='internlm102b2', + path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + stop_words=['', '<|im_end|>'], + ), + dict( + type=HuggingFacewithChatTemplate, + abbr='internlm102b3', + path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + stop_words=['', '<|im_end|>'], + ) +] + +## ------------- Evaluation Configuration +eval = dict( + partitioner=dict( + type=SubjectiveSizePartitioner, + strategy='split', + max_task_size=10000000, + mode='m2n', + infer_order='double', + base_models=[gpt4], + compare_models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)), + #given_pred = [{'abbr':'gpt4-turbo', 'path':''}] +) + +work_dir = 'outputs/compassbench/' + +summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add') diff --git a/configs/summarizers/groups/charm_reason.py b/configs/summarizers/groups/charm_reason.py index 52edc8d2b..3d1f4c199 100644 --- a/configs/summarizers/groups/charm_reason.py +++ b/configs/summarizers/groups/charm_reason.py @@ -20,16 +20,16 @@ ] -charm_reaso_summary_groups = [] +charm_reason_summary_groups = [] for prompt in prompts: for region in regions: subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks] - charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets}) + charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets}) for prompt in prompts: subsets = ['charm-reason-' + region + '_' + prompt for region in regions] - charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets}) + charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets}) -charm_reaso_summary_groups.append( +charm_reason_summary_groups.append( {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']} ) diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 1c4de400c..439d6b2be 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -1,6 +1,7 @@ from .alignbench import AlignmentBenchDataset # noqa: F401, F403 from .arena_hard import ArenaHardDataset # noqa: F401, F403 from .compass_arena import CompassArenaDataset # noqa: F401, F403 +from .compassbench import CompassBenchDataset # noqa: F401, F403 from .corev2 import Corev2Dataset # noqa: F401, F403 from .creationbench import CreationBenchDataset # noqa: F401, F403 from .information_retrival import IRDataset # noqa: F401, F403 diff --git a/opencompass/datasets/subjective/compassbench.py b/opencompass/datasets/subjective/compassbench.py new file mode 100644 index 000000000..bd6f6a1d9 --- /dev/null +++ b/opencompass/datasets/subjective/compassbench.py @@ -0,0 +1,101 @@ +# flake8: noqa +import json +import os.path as osp + +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET + +from ..base import BaseDataset + +base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。 +[用户问题] +{question} + +[回答1开始] +{prediction} +[回答1结束] + +[回答2开始] +{prediction2} +[回答2结束] + +根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择: +A. 回答1更好 +B. 回答2更好 +C. 
回答1、2平局 + +如果你认为回答1更好,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[A]] + +如果你认为回答2更好,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[B]] + +如果你认为回答1、2打成平手,你的输出应形如: +评价1:回答1 xxx +评价2:回答2 xxx +选择:[[C]] +""" + +base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options: +A. Response 1 is better +B. Response 2 is better +C. Both responses are equal + +[user's question] +{question} + +[Response 1 Start] +{prediction} +[Response 1 End] + +[Response 2 Start] +{prediction2} +[Response 2 End] + +If you believe that Response 1 is better, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[A]] + +If you believe that Response 2 is better, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[B]] + +If you believe that both responses are equally good, your output should be formatted as follows: +Evaluation 1: Response 1 xxx +Evaluation 2: Response 2 xxx +Choice: [[C]] +""" + + +@LOAD_DATASET.register_module() +class CompassBenchDataset(BaseDataset): + + def load(self, path: str, name: str): + filename = osp.join(path, f'{name}.json') + raw_data = [] + with open(filename, 'r', encoding='utf-8') as f: + json_data = json.load(f) + for problem in json_data: + question = problem['question'] + lan = problem['language'] + others = problem['others'] + judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en + raw_data.append({ + 'question': question, + 'judge_prompt': judge_prompt, + 'judge': { + 'lan': lan, + 'level': others['level'], + 'category': problem['category'], + 'question': question + } + }) + dataset = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 54ed56ade..88de42f56 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -4,6 +4,7 @@ from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer from .compass_arena import CompassArenaSummarizer +from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer from .creationbench import CreationBenchSummarizer from .flames import FlamesSummarizer diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py new file mode 100644 index 000000000..7a34c54a9 --- /dev/null +++ b/opencompass/summarizers/subjective/compassbench.py @@ -0,0 +1,241 @@ +# flake8: noqa +# yapf: disable +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime +from itertools import product + +import mmengine +from mmengine import ConfigDict +from tabulate import tabulate + +from opencompass.partitioners.sub_naive import remove_duplicate_pairs +from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg + +from .utils import get_judgeanswer_and_reference, get_outdir + + +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + +def post_process_compass_arena(s): + if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s): + return result[0] + else: + return None + + +def check_position_bias(judged_answers, references, banned_choice=['C']): + """Check position bias for judgellm's judgement. 
+ + Args: + judged_answers: The successfully extracted judgement. + references: The references contains original question, which is used to located the same question for different position judgement. + """ + position_bias_flag = 0 + position_bias_dict = {} + for judge, ref in zip(judged_answers, references): + question = ref['question'] + question_hash = hash(question) + if question_hash not in position_bias_dict: + position_bias_dict[question_hash] = { + 'question': question, + 'judge': judge + } + else: + first_judge = position_bias_dict[question_hash]['judge'] + if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice: + # If second choice is same with first choice, there has position bias. + position_bias_flag += 1 + return position_bias_flag + + +class CompassBenchSummarizer: + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, + config: ConfigDict, + judge_type='general', + check_pos_bias=True, + summary_type='single') -> None: + self.tasks = [] + self.cfg = config + self.base_models = self.cfg['eval']['partitioner']['base_models'] + self.compare_models = self.cfg['eval']['partitioner']['compare_models'] + self.judge_models = self.cfg.get('judge_models', None) + self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None) + self.judge_type = judge_type + assert self.judge_type in ['general'] + self.judge_map = {'general': post_process_compass_arena} + self.judge_function = self.judge_map[self.judge_type] + self.check_pos_bias = check_pos_bias + self.summary_type = summary_type + + def get_score(self, time_str): + output_dir, results_folder = get_outdir(self.cfg, time_str) + model_combinations = list(product(self.base_models, self.compare_models)) + unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]]) + + if self.meta_judge_model is not None: + self.judge_models.append(self.meta_judge_model) + + scores = {} + + for idx, judge_model_cfg in enumerate(self.judge_models): + judge_model = model_abbr_from_cfg(judge_model_cfg) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + for model_pair in unique_combinations: + model1 = model_pair[0]['abbr'] + model2 = model_pair[1]['abbr'] + if idx == len(self.judge_models): + subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model + else: + subdir = model1 + '_' + model2 + '_judged-by--' + judge_model + subdir_path = os.path.join(results_folder, subdir) + if not os.path.isdir(subdir_path): + print(subdir_path + ' is not exist! 
please check!') + continue + judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + if self.check_pos_bias: + bias_num = check_position_bias(judged_answers, references) + else: + bias_num = 0 + win_model1 = defaultdict(float) + win_model2 = defaultdict(float) + categories = defaultdict(float) + difficulties = defaultdict(float) + model1 = references[0]['answer1'] + model2 = references[0]['answer2'] + for prediction, reference in zip(judged_answers, references): + categories[dataset_abbr] += 1 + categories[reference['category']] += 1 + difficulties[reference['level']] += 1 + + if prediction == 'A': + if reference['answer1'] == model1: + score_1, score_2 = 1, 0 + else: + score_1, score_2 = 0, 1 + elif prediction == 'B': + if reference['answer1'] == model1: + score_1, score_2 = 0, 1 + else: + score_1, score_2 = 1, 0 + elif prediction == 'C': + if self.summary_type == 'half_add': + score_1, score_2 = 0.5, 0.5 + else: + score_1, score_2 = 0, 0 + + win_model1[reference['category']] += score_1 + win_model1[dataset_abbr] += score_1 + win_model2[reference['category']] += score_2 + win_model2[dataset_abbr] += score_2 + for category in categories: + win_model1[category] = win_model1[category] / categories[category] * 100 + win_model1[category] = round(win_model1[category], 2) + win_model2[category] = win_model2[category] / categories[category] * 100 + win_model2[category] = round(win_model2[category], 2) + win_model1['position_bias'] = bias_num + win_model2['position_bias'] = bias_num + + if judge_model not in scores: + scores[judge_model] = {} + if dataset_abbr not in scores[judge_model]: + scores[judge_model][dataset_abbr] = {} + scores[judge_model][dataset_abbr][model2] = win_model2 + + return scores + + def summarize( + self, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'), + ): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. 
+ """ + + + scores = self.get_score(time_str) + # scores['win_' + model1] = win_model1 + output_dir, results_folder = get_outdir(self.cfg, time_str) + + + for idx, judge_model in enumerate(self.judge_models): + judge_abbr = model_abbr_from_cfg(judge_model) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + one_column = list(scores[judge_abbr][dataset_abbr].values())[0] + row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']] + row_headers = [dataset_abbr, 'position_bias'] + row_headers + headers = [''] + summarizer_model_abbrs + table = [] + for row_header in row_headers: + row = [row_header] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) + + table = [] + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + headers = [''] + summarizer_model_abbrs + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + row = [dataset_abbr] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv') + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) diff --git a/opencompass/utils/prompt.py b/opencompass/utils/prompt.py index 496eec157..d65f6a03a 100644 --- a/opencompass/utils/prompt.py +++ b/opencompass/utils/prompt.py @@ -2,6 +2,7 @@ import hashlib import json +import re from copy import deepcopy from typing import Dict, List, Union @@ -19,9 +20,15 @@ def safe_format(input_str: str, **kwargs) -> str: Returns: str: The formatted string. 
""" + segs = [input_str] for k, v in kwargs.items(): - input_str = input_str.replace(f'{{{k}}}', str(v)) - return input_str + regex = re.compile(f'(?<={{{k}}})(?={{{k}}})|({{{k}}})') + segs = [regex.split(seg) for seg in segs] + segs = sum(segs, []) + replace_dict = {f'{{{k}}}': str(v) for k, v in kwargs.items()} + segs = [replace_dict.get(seg, seg) for seg in segs] + output_str = ''.join(segs) + return output_str def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str: diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index b584795f0..0072060ca 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -86,8 +86,14 @@ def get_config_from_arg(args) -> Config: config['models'] = change_accelerator(config['models'], args.accelerator) if config.get('eval', {}).get('partitioner', {}).get('models') is not None: config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator) + if config.get('eval', {}).get('partitioner', {}).get('base_models') is not None: + config['eval']['partitioner']['base_models'] = change_accelerator(config['eval']['partitioner']['base_models'], args.accelerator) + if config.get('eval', {}).get('partitioner', {}).get('compare_models') is not None: + config['eval']['partitioner']['compare_models'] = change_accelerator(config['eval']['partitioner']['compare_models'], args.accelerator) if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None: config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator) + if config.get('judge_models', {}) is not None: + config['judge_models'] = change_accelerator(config['judge_models'], args.accelerator) return config # parse dataset args @@ -211,7 +217,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModel acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr=model['abbr'].replace('hf', 'lmdeploy') if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy', + abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', path=model['path'], engine_config=dict(session_len=model['max_seq_len'], max_batch_size=model['batch_size'], @@ -254,7 +260,7 @@ def change_accelerator(models, accelerator): mod = VLLMwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-vllm', + abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm', path=model['path'], model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']), max_out_len=model['max_out_len'], @@ -266,7 +272,7 @@ def change_accelerator(models, accelerator): mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', - abbr='-hf'.join(model['abbr'].split('-hf')[:-1]) + '-turbomind', + abbr=model['abbr'].replace('hf', 'turbomind') if '-hf' in model['abbr'] else model['abbr'] + '-turbomind', path=model['path'], engine_config=dict(max_batch_size=model.get('batch_size', 16), tp=model['run_cfg']['num_gpus']), gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), From 7c381e5be8f076e73e81b0c4c2ed9bfd72b9313a Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Fri, 31 May 2024 11:40:47 +0800 Subject: [PATCH 08/12] [Fix] fix summarizer (#1217) * fix summarizer * fix summarizer --- configs/eval_subjective_arena_hard.py | 4 +-- 
.../summarizers/subjective/arenahard.py | 33 +++++++++++++++---- opencompass/utils/run.py | 2 +- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/configs/eval_subjective_arena_hard.py b/configs/eval_subjective_arena_hard.py index 08db8b29f..4200fd04a 100644 --- a/configs/eval_subjective_arena_hard.py +++ b/configs/eval_subjective_arena_hard.py @@ -72,8 +72,8 @@ key='', meta_template=api_meta_template, query_per_second=1, - max_out_len=5120, - max_seq_len=9216, + max_out_len=4096, + max_seq_len=8192, batch_size=10, retry=10, temperature = 0, diff --git a/opencompass/summarizers/subjective/arenahard.py b/opencompass/summarizers/subjective/arenahard.py index 5958bed9f..d82c800fb 100644 --- a/opencompass/summarizers/subjective/arenahard.py +++ b/opencompass/summarizers/subjective/arenahard.py @@ -133,6 +133,27 @@ def get_win_rate_column(df, column, baseline='gpt4-0314'): return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2)) +def load_model_preds(filename): + root, ext = osp.splitext(filename) + partial_filename = root + '_0' + ext + if osp.exists(osp.realpath(filename)): + preds = mmengine.load(filename) + pred_strs = [ + preds[str(i)]['prediction'] for i in range(len(preds)) + ] + else: + filename = partial_filename + pred_strs = [] + i = 1 + while osp.exists(osp.realpath(filename)): + preds = mmengine.load(filename) + filename = root + f'_{i}' + ext + i += 1 + pred_strs += [ + preds[str(i)]['prediction'] for i in range(len(preds)) + ] + return pred_strs + def get_battles_from_judgment(dataset, subdir_path, post_process, WEIGHT=3): arena_hard_battles = pd.DataFrame() dataset_abbr = dataset_abbr_from_cfg(dataset) @@ -274,12 +295,12 @@ def get_score(self, time_str): if model == 'gpt4-0314': stats.at[i, 'avg_tokens'] = 423 else: - with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json'), 'r') as f: - model_preds = json.load(f) - pred_length = 0 - for k, v in model_preds.items(): - pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction'])) - pred_length /= len(model_preds) + file_name = os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json') + model_preds = load_model_preds(file_name) + pred_length = 0 + for model_pred in model_preds: + pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(model_pred, disallowed_special=())) + pred_length /= len(model_preds) stats.at[i, 'avg_tokens'] = pred_length stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist() stats.sort_values(by='model', inplace=True) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 0072060ca..5eaa0271d 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -92,7 +92,7 @@ def get_config_from_arg(args) -> Config: config['eval']['partitioner']['compare_models'] = change_accelerator(config['eval']['partitioner']['compare_models'], args.accelerator) if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None: config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator) - if config.get('judge_models', {}) is not None: + if config.get('judge_models') is not None: config['judge_models'] = change_accelerator(config['judge_models'], args.accelerator) return config From b272803d8af589245bdcc2279f86f2742262e2a3 Mon Sep 17 00:00:00 2001 From: mqy004 <134621827+mqy004@users.noreply.github.com> Date: Fri, 31 May 2024 13:23:33 +0800 Subject: [PATCH 09/12] 
Fix the issue that opencompass.cli.main cannot be imported after installing the release version (#1221) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create __init__.py * Create __init__.py * Create __init__.py * Create __init__.py * Create __init__.py * Create __init__.py * format --------- Co-authored-by: Leymore --- opencompass/cli/__init__.py | 0 opencompass/datasets/IFEval/__init__.py | 0 opencompass/datasets/lawbench/evaluation_functions/__init__.py | 1 + opencompass/datasets/lawbench/utils/__init__.py | 1 + opencompass/datasets/lawbench/utils/modules/__init__.py | 1 + opencompass/datasets/needlebench/__init__.py | 0 6 files changed, 3 insertions(+) create mode 100644 opencompass/cli/__init__.py create mode 100644 opencompass/datasets/IFEval/__init__.py create mode 100644 opencompass/datasets/lawbench/evaluation_functions/__init__.py create mode 100644 opencompass/datasets/lawbench/utils/__init__.py create mode 100644 opencompass/datasets/lawbench/utils/modules/__init__.py create mode 100644 opencompass/datasets/needlebench/__init__.py diff --git a/opencompass/cli/__init__.py b/opencompass/cli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/opencompass/datasets/IFEval/__init__.py b/opencompass/datasets/IFEval/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/opencompass/datasets/lawbench/evaluation_functions/__init__.py b/opencompass/datasets/lawbench/evaluation_functions/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/opencompass/datasets/lawbench/evaluation_functions/__init__.py @@ -0,0 +1 @@ + diff --git a/opencompass/datasets/lawbench/utils/__init__.py b/opencompass/datasets/lawbench/utils/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/opencompass/datasets/lawbench/utils/__init__.py @@ -0,0 +1 @@ + diff --git a/opencompass/datasets/lawbench/utils/modules/__init__.py b/opencompass/datasets/lawbench/utils/modules/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/opencompass/datasets/lawbench/utils/modules/__init__.py @@ -0,0 +1 @@ + diff --git a/opencompass/datasets/needlebench/__init__.py b/opencompass/datasets/needlebench/__init__.py new file mode 100644 index 000000000..e69de29bb From 02a0a4e857311b4f97edb9827ac43cb6aff8ce98 Mon Sep 17 00:00:00 2001 From: Xingyuan Bu Date: Mon, 3 Jun 2024 14:52:12 +0800 Subject: [PATCH 10/12] MT-Bench-101 (#1215) * add mt-bench-101 * add readme and requirements * add mt-bench-101 data * Update readme_mtbench101.md * update readme * update leaderboard * fix typo * Update readme_mtbench101.md * fit newest opencompass * update readme.md * mtbench101 to opencompass * mtbench101 to opencompass * for code review * for code review * for code review * hook * hook --------- Co-authored-by: liujie --- .pre-commit-config.yaml | 1 + .../subjective/multiround/mtbench101_judge.py | 62 ++++ configs/eval_subjective_mtbench101.py | 94 +++++ opencompass/datasets/subjective/__init__.py | 1 + opencompass/datasets/subjective/mtbench101.py | 327 ++++++++++++++++++ .../summarizers/subjective/__init__.py | 1 + .../summarizers/subjective/mtbench101.py | 142 ++++++++ 7 files changed, 628 insertions(+) create mode 100644 configs/datasets/subjective/multiround/mtbench101_judge.py create mode 100644 configs/eval_subjective_mtbench101.py create mode 100644
opencompass/datasets/subjective/mtbench101.py create mode 100644 opencompass/summarizers/subjective/mtbench101.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a94a1e53..f03049e73 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,7 @@ exclude: | opencompass/datasets/teval/| opencompass/datasets/NPHardEval/| opencompass/datasets/TheoremQA| + opencompass/datasets/subjective/mtbench101.py| docs/zh_cn/advanced_guides/compassbench_intro.md ) repos: diff --git a/configs/datasets/subjective/multiround/mtbench101_judge.py b/configs/datasets/subjective/multiround/mtbench101_judge.py new file mode 100644 index 000000000..eb1030bf4 --- /dev/null +++ b/configs/datasets/subjective/multiround/mtbench101_judge.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import MTBench101Dataset + + +subjective_reader_cfg = dict( + input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'], + output_column='judge', + ) + +subjective_all_sets = [ + 'mtbench101', +] +data_path ='data/subjective/' + +subjective_datasets = [] + +for _name in subjective_all_sets: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="""{dialogue}""", + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}') + ], + round=[ + dict( + role='HUMAN', + prompt = '{prompt_template}' + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=f'{_name}', + type=MTBench101Dataset, + path=data_path, + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/eval_subjective_mtbench101.py b/configs/eval_subjective_mtbench101.py new file mode 100644 index 000000000..46438deb1 --- /dev/null +++ b/configs/eval_subjective_mtbench101.py @@ -0,0 +1,94 @@ +from mmengine.config import read_base + +with read_base(): + from .datasets.subjective.multiround.mtbench101_judge import subjective_datasets + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI +from opencompass.partitioners import NaivePartitioner, SizePartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.runners import LocalRunner +from opencompass.runners import SlurmSequentialRunner +from opencompass.tasks import OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import MTBench101Summarizer + +# --------------------------------------------------------------------------------------------------------- + +api_meta_template = dict( + round=[ + dict(role='SYSTEM', api_role='SYSTEM'), + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) + +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often 
set do_sample=True for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=4096, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=2, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +infer = dict( + partitioner=dict(type=SizePartitioner, max_task_size=10000), + runner=dict( + type=SlurmSequentialRunner, + partition='llm_dev2', + quotatype='auto', + max_num_workers=32, + task=dict(type=OpenICLInferTask), + ), +) + +# -------------Evaluation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration +judge_models = [dict( + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', # To compare with the official leaderboard, please use gpt-4-1106-preview + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=16, + max_out_len=4096, + max_seq_len=4096, + batch_size=8, + temperature=0.8, +)] + +## ------------- Evaluation Configuration + + + +eval = dict( + partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100000, mode='singlescore', models=models, judge_models=judge_models), + runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)), +) + +summarizer = dict(type=MTBench101Summarizer, judge_type='single') + +work_dir = 'outputs/mtbench101/' diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 439d6b2be..54b3d37c1 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -6,5 +6,6 @@ from .creationbench import CreationBenchDataset # noqa: F401, F403 from .information_retrival import IRDataset # noqa: F401, F403 from .mtbench import MTBenchDataset # noqa: F401, F403 +from .mtbench101 import MTBench101Dataset # noqa: F401, F403 from .multiround import MultiroundDataset # noqa: F401, F403 from .subjective_cmp import SubjectiveCmpDataset # noqa: F401, F403 diff --git a/opencompass/datasets/subjective/mtbench101.py b/opencompass/datasets/subjective/mtbench101.py new file mode 100644 index 000000000..70d1caedd --- /dev/null +++ b/opencompass/datasets/subjective/mtbench101.py @@ -0,0 +1,327 @@ +# flake8: noqa: E501 +import json +import os.path as osp +import re +from typing import Optional + +from datasets import Dataset, DatasetDict +from torch.utils.data import DataLoader, Dataset + +from opencompass.registry import LOAD_DATASET + +from ..base import BaseDataset + +skip_first_tasks = ['FR', 'CR', 'AR', 'SA', 'SC', 'CM'] + +need_ref_tasks = ['MR', 'GR'] + +judge = "Please act as an impartial judge following these instructions: In the following conversations, only the response of the 'assistant' in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated.
Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria.\ + Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations are the ground truth history, which does NOT need to be evaluated." + +score_format = "\n\n Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated!! You must provide your explanation. After providing your explanation, please show the score by strictly following this format: 'Rating: [[score]]', for example 'Rating: [[6]]'. The DIALOGUE that needs to be judged is in this format: \n *** \n DIALOGUE \n ***" + +eval_CM = "The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. The evaluation criteria are as follows:\n\ +\n\ +1.Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments.\n\ +2.Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue.\n\ +3.Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation.\n\ +4.Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow.\n\ +4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context.\n\ +7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow.\n\ +10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally.\n\ +When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively.
However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance.\n\ +\n\ +Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness." + +eval_SI = "\n We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows:\ +\n \ +1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task.\ +2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements.\ +\n \ +Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines:\ +\n \ +- 1-3 points: The AI assistant failed to understand the task request and neither asked relevant questions nor provided information related to the task.\ +- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant.\ +- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights.\ +- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations.\ +\n \ +Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary." + +eval_CR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ +\n \ +1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ +2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ +- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ +- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ +- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ +\n \ +Please provide a brief justification for the score you give and present your score.
Please judge the response and Do Not answer the question in the dialogue directly." + +eval_FR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ +\n \ +1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ +2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ +- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ +- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ +- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ +\n \ +Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly." + +eval_CC = "\nThe AI assistant's capability to resist interference will be measured against these criteria:\ +\n\n \ +1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information.\n\ +2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only.\ +\n\n \ +Scoring Guidelines:\ +\n\n \ +- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information.\n\ +- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question.\n\ +- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions.\n\ +- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant.\ +\n\n \ +Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria. " + +eval_TS = "\nThe AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows:\n \ +\n \ +1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject.\n \ +2. 
Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds.\n \ +3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature.\n \ +4. Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads.\n \ +Scoring Guidelines:\n \ +\n \ +1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter.\n \ +4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts.\n \ +7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction.\n \ +10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content.\n \ +When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities.\n \ +\n \ +Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria." + +eval_AR = "The AI assistant's understanding of references is essential for maintaining a coherent dialogue. The following criteria should be used to evaluate its performance:\n\ +\n \ +1. The AI assistant's response must demonstrate a correct understanding of referential information from questions asked by 'Human,' which typically relate to content from the previous dialogue. Ideally, the AI should explicitly acknowledge or clarify these references in its reply.\n\ +2. 
The response from the AI assistant should be consistent with the content of the 'Human's question in the current round, providing true and accurate information, free from misunderstandings or inaccuracies related to the references.\n\ +\n \ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant fails to recognize or correctly interpret the referential information, leading to responses that are either inaccurate or unrelated to the previous content.\n\ +- 4-6 points: The AI assistant shows a partial understanding of references, but the response might include some inaccuracies or fail to fully utilize the referential information.\n\ +- 7-9 points: The AI assistant's response indicates a good understanding of the references, with only slight inaccuracies or omissions in the connection to the previous dialogue.\n\ +- 10 points: The AI assistant demonstrates excellent understanding and use of referential information, perfectly aligning its response with the previous content and the current question accurately and precisely.\n\ +\n \ +In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above. " + +eval_IC = "The AI assistant’s ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows:\n \ +\n \ +1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification.\n \ +2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query.\n \ +3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question.\n \ +- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified.\n \ +- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question.\n \ +- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question.\n \ +\n \ +When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. 
If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue.\n \ +\n \ +Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query." + +eval_SC = "We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows:\n\ +\n\ +1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation.\n\ +2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response.\n\ +\n\ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response.\n\ +- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response.\n\ +- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer.\n\ +- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response.\n\ +\n\ +In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above." + +eval_SA = "We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows:\n\ +\n\ +1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge.\n\ +2. 
If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance.\n\ +\n\ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect.\n\ +- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer.\n\ +- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification.\n\ +- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge.\n\ +\n\ +In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above." + +eval_PI = "The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows:\n\ +\n\ +1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics.\n\ +2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts.\n\ +3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it.\n\ +4. 
Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation.\n\ +4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'.\n\ +7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'.\n\ +10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation.\n\ +When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience." + +eval_MR = "The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows:\n\ +\n\ +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem.\n\ +2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles.\n\ +3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ +4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards.\n\ +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles.\n\ +7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding.\n\ +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution." + +eval_GR = "The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows:\n\ +\n\ +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in format ‘### reference solution ###‘ for the specific problem.\n\ +2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning.\n\ +3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ +4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning.\n\ +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles.\n\ +7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding.\n\ +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution." + +unique_prompt = { + 'CM': eval_CM, + 'SI': eval_SI, + 'AR': eval_AR, + 'TS': eval_TS, + 'CC': eval_CC, + 'CR': eval_CR, + 'FR': eval_FR, + 'SC': eval_SC, + 'SA': eval_SA, + 'MR': eval_MR, + 'GR': eval_GR, + 'IC': eval_IC, + 'PI': eval_PI, +} + + +def eval_prompt_construct(task, ref_answer, history): + + if task in need_ref_tasks: + system_prompt = judge + unique_prompt[task] + score_format + prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***\n\n\ + The reference solution is: \n ### \n {ref_answer} \n ###\n\n'.format( + history=history, prediction='{prediction}', ref_answer=ref_answer) + + else: + system_prompt = judge + unique_prompt[task] + score_format + prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***'.format( + history=history, prediction='{prediction}') + + return system_prompt, prompt_template + + +def add_format(question, answer): + history = [dict(role='user', content=question)] + if answer: + history += [dict(role='assistant', content=answer)] + return history + + +@LOAD_DATASET.register_module() +class MTBench101Dataset(BaseDataset): + + def load(self, path: str, name: str): + import copy + + filename = osp.join(path, f'{name}.jsonl') + # filename = osp.join(path, 'mtbench101.jsonl') + dataset = DatasetDict() + raw_data = [] + + lines = open(filename, 'r', encoding='utf-8').readlines() + conversations = [] + for line in lines: + line = json.loads(line) + conversations.append(line) + + for dialogue in conversations: + multi_id = dialogue['id'] + task = dialogue['task'] + if task in skip_first_tasks: + skip_first = True + else: + skip_first = False + + current_multi_id = None + pre_dia = [] + history = '' + dia_list = [] + for turn_index, turn in enumerate(dialogue['history']): + human = turn['user'] + assistant = turn['bot'] + 
turn_id = str(turn_index + 1) + + if current_multi_id is not None and multi_id != current_multi_id: + pre_dia = [] + history = '' + + current_multi_id = multi_id + + if skip_first and turn_index == 0: + pre_dia = add_format(question=human, answer=assistant) + history = '\n\n Human: ' + human + '\n\nAssistant: ' + assistant + continue + + history = history + '\n\n Human: ' + human + '\n\nAssistant: ' + pre_dia += add_format(question=human, answer=assistant) + + pre_dia_copy = copy.deepcopy(pre_dia) + + system_prompt, prompt_template = eval_prompt_construct( + task, pre_dia, history) + + raw_data.append({ + 'dialogue': pre_dia_copy, + 'task': task, + 'multi_id': current_multi_id, + 'turn_id': turn_id, + 'system_prompt': system_prompt, + 'prompt_template': prompt_template, + 'judge': { + 'task': task, + 'multi_id': current_multi_id, + 'turn_id': turn_id, + } + }) + history = history + assistant + + dataset = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 88de42f56..d341985ad 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -10,4 +10,5 @@ from .flames import FlamesSummarizer from .information_retrival import IRSummarizer from .mtbench import MTBenchSummarizer +from .mtbench101 import MTBench101Summarizer from .multiround import MultiroundSummarizer diff --git a/opencompass/summarizers/subjective/mtbench101.py b/opencompass/summarizers/subjective/mtbench101.py new file mode 100644 index 000000000..a77f5617c --- /dev/null +++ b/opencompass/summarizers/subjective/mtbench101.py @@ -0,0 +1,142 @@ +# flake8: noqa: E501 +import csv +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import numpy as np +from mmengine import ConfigDict + +try: + from prettytable import from_csv +except ImportError: + from_csv = None + +from opencompass.utils import model_abbr_from_cfg + +from .compass_arena import CompassArenaSummarizer +from .utils import get_judgeanswer_and_reference, get_outdir + +# from .utils.writer import Writer + + +def post_process_mtbench_pair(judgement: str): + """Input a string like below: + + xxx[[A]]xxx, and extract the judge + """ + pattern = r'\[([A-C]+)\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + return matched_result[0] + else: + return None + + +def post_process_mtbench101(judgement: str): + """Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + match = re.search(r'\[([0-9]+)\]', judgement) + if match: + score = int(match.group(1)) + + else: + return None + + return {'score': score, 'judgement': judgement} + + +def get_final_results(judged_answers, references, output_dir, fout_flag, + model): + + task_multi_id_scores = defaultdict(list) + task_scores = defaultdict(list) + + for ans, ref in zip(judged_answers, references): + + task = ref['task'] + multi_id = ref['multi_id'] + score = ans['score'] + + task_multi_id_scores[(task, multi_id)].append(score) + + for (task, multi_id), scores in task_multi_id_scores.items(): + min_score = min(scores) + task_scores[task].append(min_score) + + final_task_scores = { + task: sum(scores) / len(scores) if scores else 0 + for task, scores in task_scores.items() + } + + fout = osp.join(output_dir, 'task_score.csv') + + columns = list(final_task_scores.keys()) + + print('================task_score=====================') + print(final_task_scores) + + with 
open(fout, 'a+', newline='') as csvfile: + + writer = csv.writer(csvfile) + if fout_flag == 0: + writer.writerow(['model'] + columns) + writer.writerow([model] + + [final_task_scores[column] for column in columns]) + return 0 + + +class MTBench101Summarizer(CompassArenaSummarizer): + """Do the subjectivity analysis based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, config: ConfigDict, judge_type='single') -> None: + + self.tasks = [] + self.cfg = config + + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.eval_model_abbrs = [ + model_abbr_from_cfg(model) for model in self.eval_model_cfgs + ] + + self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0]) + + self.judge_function = post_process_mtbench101 + + def summarize(self, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. + """ + dataset_cfgs = self.cfg['datasets'] + output_dir, results_folder = get_outdir(self.cfg, time_str) + fout_flag = 0 + for eval_model_abbr in self.eval_model_abbrs: + subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr + subdir_path = os.path.join(results_folder, subdir) + if os.path.isdir(subdir_path): + model, judge_model = eval_model_abbr, self.judge_abbr + + for dataset in dataset_cfgs: + print() + judged_answers, references = get_judgeanswer_and_reference( + dataset, subdir_path, self.judge_function) + get_final_results(judged_answers, references, output_dir, + fout_flag, model) + fout_flag += 1 + else: + print(subdir_path + ' does not exist! Please check!') From 982e0245402b7085ef76b7a63a5bcc4812d8662f Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:40:48 +0800 Subject: [PATCH 11/12] [Feature] add dataset Fofo (#1224) * add fofo dataset * add dataset fofo --- configs/datasets/subjective/fofo/README.md | 30 ++++ .../datasets/subjective/fofo/fofo_judge.py | 96 +++++++++ configs/eval_subjective_fofo.py | 69 ++++++++ opencompass/datasets/subjective/__init__.py | 1 + .../datasets/subjective/compassbench.py | 3 +- opencompass/datasets/subjective/fofo.py | 36 ++++ .../openicl/icl_evaluator/lm_evaluator.py | 1 + .../summarizers/subjective/__init__.py | 1 + opencompass/summarizers/subjective/fofo.py | 154 ++++++++++++++ 9 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 configs/datasets/subjective/fofo/README.md create mode 100644 configs/datasets/subjective/fofo/fofo_judge.py create mode 100644 configs/eval_subjective_fofo.py create mode 100644 opencompass/datasets/subjective/fofo.py create mode 100644 opencompass/summarizers/subjective/fofo.py diff --git a/configs/datasets/subjective/fofo/README.md b/configs/datasets/subjective/fofo/README.md new file mode 100644 index 000000000..cc39b20be --- /dev/null +++ b/configs/datasets/subjective/fofo/README.md @@ -0,0 +1,30 @@ +# Fofo +## Introduction +This paper presents FoFo, a pioneering benchmark for evaluating large language models' (LLMs) ability to follow complex, domain-specific formats, a crucial yet underexamined capability for their application as AI agents. Despite LLMs' advancements, existing benchmarks fail to assess their format-following proficiency adequately.
FoFo fills this gap with a diverse range of real-world formats and instructions, developed through an AI-Human collaborative method. Our evaluation across both open-source (e.g., Llama 2, WizardLM) and closed-source (e.g., GPT-4, PALM2, Gemini) LLMs highlights three key findings: open-source models significantly lag behind closed-source ones in format adherence; LLMs' format-following performance is independent of their content generation quality; and LLMs' format proficiency varies across different domains. These insights suggest the need for specialized tuning for format-following skills and highlight FoFo's role in guiding the selection of domain-specific AI agents. + +## Official link +https://github.com/SalesforceAIResearch/FoFo/tree/main + +### Paper +https://arxiv.org/abs/2402.18667 + +## Examples +Input example I: +``` +Create a detailed medical diagnostic report in JSON format for a hypothetical patient based on the following clinical scenario and laboratory results. \n\n**Clinical Scenario:**\n- Patient Identifier: 12345X\n- Gender: Female\n- Age: 40 years\n- Presenting Complaint: Acute onset of sharp, right lower quadrant abdominal pain that began approximately 6 hours ago\n- Past Medical History: Hypertension, well-controlled on medication; no known allergies; nonsmoker; nulliparous\n- Recent Labs: Slight leukocytosis, normal hemoglobin, elevated C-reactive protein\n- Imaging: Ultrasound indicates a thickened wall of the appendix with peri-appendiceal fluid collection\n- Surgery: The patient underwent an emergency laparoscopic appendectomy\n- Pathology Report: Confirmed acute appendicitis with peri-appendiceal abscess formation, no malignancy noted\n\nUsing the sample data provided above, generate a JSON object with the following structure:\n\n1. Basic Patient Information:\n - PatientID\n - Gender\n - Age\n\n2. Encounter details:\n - Presentation\n - Symptoms\n - DurationOfSymptoms\n - PastMedicalHistory\n - RecentLaboratoryFindings (nested object with LabName as key and the value as another object containing Result and ReferenceRange)\n - ImagingResults (array of objects with ImagingType, Findings, and Conclusion)\n\n3. Surgical Intervention:\n - SurgeryType\n - Urgency (Elective or Emergency)\n - Outcome (Success or Complication, followed by a brief description)\n\n4. Pathology Report:\n - Diagnosis\n - Findings\n - Implications\n \n5. Recommendations:\n - Follow up actions\n - Medications (array of objects with DrugName, Dosage, Frequency, and Duration)\n - Warnings (e.g., warning signs of complications to watch for post-operation)\n\nYou are allowed to make up any additional relevant data, such as reference ranges for the lab results or specific medication names and dosages for the follow-up recommendations. The JSON must strictly adhere to proper JSON formatting rules, including the use of curly brackets for objects, square brackets for arrays, and double quotes for strings. Keep key names succinct and ensure they accurately reflect their corresponding data. The output should be directly usable as a comprehensive digital diagnostic report for the hypothetical patient described. 
+``` + +Output example I (from GPT-4): +``` +\n \"BasicPatientInformation\": {\n \"PatientID\": \"12345X\",\n \"Gender\": \"Female\",\n \"Age\": 40\n },\n \"EncounterDetails\": {\n \"Presentation\": \"Acute onset of sharp, right lower quadrant abdominal pain\",\n \"Symptoms\": \"Sharp abdominal pain\",\n \"DurationOfSymptoms\": \"6 hours\",\n \"PastMedicalHistory\": \"Hypertension, well-controlled on medication; no known allergies; nonsmoker; nulliparous\",\n \"RecentLaboratoryFindings\": {\n \"LeukocyteCount\": {\n \"Result\": \"Slightly elevated\",\n \"ReferenceRange\": \"4.5-11.0 x 10^9\/L\"\n },\n \"Hemoglobin\": {\n \"Result\": \"Normal\",\n \"ReferenceRange\": \"12.0-16.0 g\/dL\"\n },\n \"CReactiveProtein\": {\n \"Result\": \"Elevated\",\n \"ReferenceRange\": \"0-5 mg\/L\"\n }\n },\n \"ImagingResults\": [\n {\n \"ImagingType\": \"Ultrasound\",\n \"Findings\": \"Thickened wall of the appendix with peri-appendiceal fluid collection\",\n \"Conclusion\": \"Appendix inflammation indicative of acute appendicitis\"\n }\n ]\n },\n \"SurgicalIntervention\": {\n \"SurgeryType\": \"Laparoscopic Appendectomy\",\n \"Urgency\": \"Emergency\",\n \"Outcome\": \"Success, no post-operative complications noted\"\n },\n \"PathologyReport\": {\n \"Diagnosis\": \"Acute appendicitis with peri-appendiceal abscess formation\",\n \"Findings\": \"Inflammation and abscess formation in the appendix, no malignancy\",\n \"Implications\": \"Appendix successfully removed, no malignancy found\"\n },\n \"Recommendations\": {\n \"FollowUpActions\": \"Patient should rest and recover, and attend a follow-up appointment in 2 weeks\",\n \"Medications\": [\n {\n \"DrugName\": \"Ibuprofen\",\n \"Dosage\": \"400 mg\",\n \"Frequency\": \"Every 6 hours\",\n \"Duration\": \"1 week\"\n },\n {\n \"DrugName\": \"Paracetamol\",\n \"Dosage\": \"500 mg\",\n \"Frequency\": \"Every 4 hours\",\n \"Duration\": \"1 week\"\n }\n ],\n \"Warnings\": \"Contact healthcare provider if pain persists or worsens, if fever develops, or if there are any signs of infection at the surgical site\"\n }\n +``` + +## Reference +``` +@article{xia2024fofo, + title={FOFO: A Benchmark to Evaluate LLMs' Format-Following Capability}, + author={Xia, Congying and Xing, Chen and Du, Jiangshu and Yang, Xinyi and Feng, Yihao and Xu, Ran and Yin, Wenpeng and Xiong, Caiming}, + journal={arXiv preprint arXiv:2402.18667}, + year={2024} +} +``` diff --git a/configs/datasets/subjective/fofo/fofo_judge.py b/configs/datasets/subjective/fofo/fofo_judge.py new file mode 100644 index 000000000..c66fe9ba7 --- /dev/null +++ b/configs/datasets/subjective/fofo/fofo_judge.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import FofoDataset +from mmengine.config import read_base + +subjective_reader_cfg = dict( + input_columns=['question'], + output_column='judge', + ) + +subjective_all_sets = [ + 'fofo_test_prompts', 'fofo_test_prompts_cn', +] + +base_prompt = """ +I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. 
I will provide both the prompts and the responses for this purpose. + +Here is the prompt: +{ + "instruction": "{question}", +} + +Here are the outputs of the models: +[ + { + "model": "model", + "answer": "{prediction}" + }, +] + +Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output: +```json +[ + { + 'model': , + 'format_correctness': , + 'reasons': + } +] +``` + +Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python. +""" + +subjective_datasets = [] + +for _name in subjective_all_sets: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=4096), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = base_prompt + ), + ]), + ), + ), + pred_role='BOT', + ) + + subjective_datasets.append( + dict( + abbr=f'{_name}', + type=FofoDataset, + path='./data/subjective/fofo', + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/eval_subjective_fofo.py b/configs/eval_subjective_fofo.py new file mode 100644 index 000000000..1cbef1a27 --- /dev/null +++ b/configs/eval_subjective_fofo.py @@ -0,0 +1,69 @@ +from mmengine.config import read_base + +with read_base(): + from .datasets.subjective.fofo.fofo_judge import subjective_datasets + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI +from opencompass.partitioners import NaivePartitioner, SizePartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.runners import LocalRunner +from opencompass.runners import SlurmSequentialRunner +from opencompass.tasks import OpenICLInferTask +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import FofoSummarizer + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) + +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='internlm2-chat-1.8b-hf', + path='internlm/internlm2-chat-1_8b', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=1), + stop_words=['', '<|im_end|>'], + generation_kwargs=dict( + do_sample=True, + ), + ) +] + +datasets = 
[*subjective_datasets]
+
+# -------------Evaluation Stage ----------------------------------------
+
+## ------------- JudgeLLM Configuration
+judge_models = [dict(
+    abbr='GPT4-Turbo',
+    type=OpenAI,
+    path='gpt-4-1106-preview',
+    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    meta_template=api_meta_template,
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8,
+    temperature=0,
+)]
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models,
+    ),
+    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
+)
+
+summarizer = dict(type=FofoSummarizer, judge_type='general')
+
+work_dir = 'outputs/fofo/'
diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py
index 54b3d37c1..5219700c5 100644
--- a/opencompass/datasets/subjective/__init__.py
+++ b/opencompass/datasets/subjective/__init__.py
@@ -4,6 +4,7 @@
 from .compassbench import CompassBenchDataset  # noqa: F401, F403
 from .corev2 import Corev2Dataset  # noqa: F401, F403
 from .creationbench import CreationBenchDataset  # noqa: F401, F403
+from .fofo import FofoDataset  # noqa: F401, F403
 from .information_retrival import IRDataset  # noqa: F401, F403
 from .mtbench import MTBenchDataset  # noqa: F401, F403
 from .mtbench101 import MTBench101Dataset  # noqa: F401, F403
diff --git a/opencompass/datasets/subjective/compassbench.py b/opencompass/datasets/subjective/compassbench.py
index bd6f6a1d9..fba999cfe 100644
--- a/opencompass/datasets/subjective/compassbench.py
+++ b/opencompass/datasets/subjective/compassbench.py
@@ -20,7 +20,7 @@
 {prediction2}
 [回答2结束]
 
-根据评分要求，请先对两个回答进行评价，最后在以下 3 个选项中做出选择:
+请先对两个回答进行评价，最后在以下 3 个选项中做出选择:
 A. 回答1更好
 B. 回答2更好
 C. 
回答1、2平局
@@ -87,6 +87,7 @@ def load(self, path: str, name: str):
             lan = problem['language']
             others = problem['others']
             judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
+            judge_prompt = judge_prompt.replace('{question}', question)
             raw_data.append({
                 'question': question,
                 'judge_prompt': judge_prompt,
diff --git a/opencompass/datasets/subjective/fofo.py b/opencompass/datasets/subjective/fofo.py
new file mode 100644
index 000000000..ad92fffa0
--- /dev/null
+++ b/opencompass/datasets/subjective/fofo.py
@@ -0,0 +1,36 @@
+# flake8: noqa
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class FofoDataset(BaseDataset):
+
+    def load(self, path: str, name: str):
+        filename = osp.join(path, f'{name}.json')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            json_data = json.load(f)
+            for problem in json_data:
+                question = problem['instruction']
+                lan = 'cn' if 'cn' in name else 'en'
+                raw_data.append({
+                    'question': question,
+                    'judge': {
+                        'lan': lan,
+                        'id': problem['id'],
+                        'domain': problem['domain'],
+                        'sub_domain': problem['sub_domain'],
+                        'format': problem['format'],
+                        'format_type': problem['format_type'],
+                        'question': question
+                    }
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
index bd89533c6..5d9037388 100644
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -215,6 +215,7 @@ def score(self,
             for k, v in pred_dict.items():
                 dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                 dataset.reader.input_columns.append(k)
+
             if references:
                 dataset.reader.input_columns.append('reference')
                 dataset.reader.dataset['test'] = dataset.test.add_column(
diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py
index d341985ad..7457f14f1 100644
--- a/opencompass/summarizers/subjective/__init__.py
+++ b/opencompass/summarizers/subjective/__init__.py
@@ -8,6 +8,7 @@
 from .corev2 import Corev2Summarizer
 from .creationbench import CreationBenchSummarizer
 from .flames import FlamesSummarizer
+from .fofo import FofoSummarizer
 from .information_retrival import IRSummarizer
 from .mtbench import MTBenchSummarizer
 from .mtbench101 import MTBench101Summarizer
diff --git a/opencompass/summarizers/subjective/fofo.py b/opencompass/summarizers/subjective/fofo.py
new file mode 100644
index 000000000..a00f622d1
--- /dev/null
+++ b/opencompass/summarizers/subjective/fofo.py
@@ -0,0 +1,154 @@
+# flake8: noqa: E501
+import csv
+import os
+import os.path as osp
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import numpy as np
+from mmengine import ConfigDict
+from tabulate import tabulate
+
+try:
+    from prettytable import from_csv
+except ImportError:
+    from_csv = None
+
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
+
+from .compass_arena import CompassArenaSummarizer
+from .utils import get_judgeanswer_and_reference, get_outdir
+
+# from .utils.writer import Writer
+
+
+def post_process_fofo(judgement: str):
+    """Extract the format correctness score from the judge's reply.
+
+    The judge is asked to answer with a JSON string containing a 'format_correctness' field whose value is 1 (correct) or 0 (incorrect).
+    """
+    match = re.search(r"[\"']format_correctness[\"']:\s*([0-1]+)", judgement)
+    if match:
+        score = int(match.group(1))
+    else:
+        return None
+
+    return {'score': score, 
'judgement': judgement}
+
+
+class FofoSummarizer:
+    """Do the subjectivity analysis based on evaluation results.
+
+    Args:
+        config (ConfigDict): The configuration object of the evaluation task.
+            It's expected to be filled out at runtime.
+    """
+
+    def __init__(self, config: ConfigDict, judge_type='single') -> None:
+
+        self.tasks = []
+        self.cfg = config
+
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]
+
+        self.judge_models = self.cfg.get('judge_models', None)
+
+        self.judge_function = post_process_fofo
+
+    def get_score(self, time_str):
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        total_scores = {}
+        for idx, judge_model_cfg in enumerate(self.judge_models):
+            judge_model = model_abbr_from_cfg(judge_model_cfg)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                for eval_model_abbr in self.eval_model_abbrs:
+                    subdir = eval_model_abbr + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if os.path.isdir(subdir_path):
+                        judged_answers, references = get_judgeanswer_and_reference(
+                            dataset, subdir_path, self.judge_function)
+                        scores = defaultdict(list)
+                        for ans, ref in zip(judged_answers, references):
+                            domain = ref['domain']
+                            format_name = ref['format']
+                            format_type = ref['format_type']
+                            score = ans['score']
+                            if score is not None:
+                                scores['overall'].append(score)
+                                scores[domain].append(score)
+                                if format_type == 'general':
+                                    scores[format_name].append(score)
+                        single_model_scores = {
+                            task: sum(score) / len(score)
+                            for task, score in scores.items()
+                        }
+                        if judge_model not in total_scores:
+                            total_scores[judge_model] = {}
+                        if dataset_abbr not in total_scores[judge_model]:
+                            total_scores[judge_model][dataset_abbr] = {}
+                        total_scores[judge_model][dataset_abbr][
+                            eval_model_abbr] = single_model_scores
+                    else:
+                        print(subdir_path + ' does not exist! Please check!')
+        return total_scores
+
+    def summarize(self,
+                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+        """Summarize the subjectivity analysis based on evaluation results.
+
+        Args:
+            time_str (str): Timestamp for file naming.
+
+        Returns:
+            None: The summary tables are written to CSV report files.
+        """
+        scores = self.get_score(time_str)
+        print(scores)
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        for idx, judge_model in enumerate(self.judge_models):
+            judge_abbr = model_abbr_from_cfg(judge_model)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                summarizer_model_abbrs = self.eval_model_abbrs
+                one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+                format_types = ['Json', 'CSV', 'XML', 'YAML', 'Markdown']
+                row_headers = [
+                    i for i in one_column.keys()
+                    if i not in [dataset_abbr, 'overall'] + format_types
+                ]
+                row_headers = ['overall'] + format_types + row_headers
+                headers = [dataset_abbr] + summarizer_model_abbrs
+                table = []
+                for row_header in row_headers:
+                    row = [row_header]
+                    for model_abbr in summarizer_model_abbrs:
+                        s = scores[judge_abbr][dataset_abbr][model_abbr].get(
+                            row_header, '')
+                        if isinstance(s, float):
+                            s = f'{s:.2f}'
+                        if isinstance(s, int):
+                            s = str(s)
+                        row.append(s)
+                    table.append(row)
+                txt = tabulate(table, headers=headers)
+                print(txt)
+
+                if idx == len(self.judge_models):
+                    output_filename = osp.join(
+                        output_dir, 'summarized-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+                else:
+                    output_filename = osp.join(
+                        output_dir, 'judged-by--' + judge_abbr + '-' +
+                        dataset_abbr + '-report.csv')
+
+                with open(output_filename, 'w') as f:
+                    f.write(','.join(headers) + '\n')
+                    for line in table:
+                        f.write(','.join(line) + '\n')
+                print(output_filename)

From e0d7808b4e421546c9a4f5b364f72be873d581e8 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Thu, 6 Jun 2024 11:48:07 +0800
Subject: [PATCH 12/12] [Fix] fix pip version (#1228)

* fix pip version

* fix pip version
---
 configs/eval_subjective_alpacaeval_official.py | 3 +++
 requirements/extra.txt                         | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/configs/eval_subjective_alpacaeval_official.py b/configs/eval_subjective_alpacaeval_official.py
index eac878cc7..f1504fda1 100644
--- a/configs/eval_subjective_alpacaeval_official.py
+++ b/configs/eval_subjective_alpacaeval_official.py
@@ -22,6 +22,9 @@
     reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
 )
 
+
+# To run this config, please make sure you have successfully installed `alpaca-eval==0.6` and `scikit-learn==1.5`
+
 # -------------Inference Stage ----------------------------------------
 # For subjective evaluation, we often set do sample for models
 
diff --git a/requirements/extra.txt b/requirements/extra.txt
index 6b3409e74..84f88e6ad 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -1,2 +1,3 @@
-alpaca-eval
+alpaca-eval==0.6
 faiss_gpu==1.7.2
+scikit-learn==1.5
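
For readers who want to sanity-check how the FOFO judge output introduced above is consumed, the following is a minimal, self-contained sketch. It is not part of the patches, and the sample judgement string is invented for illustration; it only mirrors the JSON shape that `fofo_judge.py`'s base_prompt asks the judge to produce and the regex used by `post_process_fofo`.

```python
# Minimal sketch: extract the 0/1 format score from a judge reply.
# The sample judgement below is hypothetical and only follows the shape
# requested by the FOFO judge prompt (model / format_correctness / reasons).
import re


def post_process_fofo(judgement: str):
    """Return the format_correctness score found in the judge's reply."""
    match = re.search(r"[\"']format_correctness[\"']:\s*([0-1]+)", judgement)
    if match:
        return {'score': int(match.group(1)), 'judgement': judgement}
    return None


sample = """[
    {
        'model': 'internlm2-chat-1.8b-hf',
        'format_correctness': 1,
        'reasons': '- The response is valid, well-formed JSON as required.'
    }
]"""

print(post_process_fofo(sample)['score'])  # -> 1
```

A regex is used here, as in the summarizer above, rather than strict `json.loads`, which tolerates judges that wrap the JSON in extra prose or use single quotes.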