From a793df561a60a34b78b05d3f569c4774fb9aee13 Mon Sep 17 00:00:00 2001 From: wangyudong Date: Thu, 14 Dec 2023 09:08:22 +0000 Subject: [PATCH] Bump sft-opencompass-v0.5.2 --- .../ds1000_compl_service_eval_gen_cbc84f.py | 2 +- .../ds1000/ds1000_service_eval_gen_cbc84f.py | 2 +- .../humaneval/humaneval_gen_6d1cc2.py | 36 +++ configs/datasets/mbpp/mbpp_gen_caa7ab.py | 65 +++++ .../123B_eval/eval_internlm-chat_123b.py | 12 +- .../eval_internlm-chat_1b_code-only.py | 4 +- .../20B_eval/eval_internlm-chat_20b.py | 12 +- .../70B_eval/eval_internlm-chat_70b.py | 11 +- .../eval_internlm-chat_7b_chatml.py | 56 ++++ .../sft_cfg/7B_eval/eval_internlm-chat_7b.py | 10 +- .../7B_eval/eval_internlm-chat_7b_aliyun.py | 6 +- .../7B_eval/eval_internlm-chat_7b_llmv2.py | 8 +- ...eval_internlm-chat_7b_safety_subjective.py | 6 +- .../eval_internlm-chat_7b_subjective.py | 6 +- configs/sft_cfg/clusters/aliyun_llm.py | 18 +- configs/sft_cfg/clusters/slurm_llmit2.py | 23 +- .../dataset_collections/code_core_set.py | 8 +- .../code_only_llama_base.py | 5 + .../code_prompt_engineering.py | 11 + .../dataset_collections/code_with_coreset.py | 8 +- .../medium_chat_sft_v052.py | 74 +++++ .../eval_llama2-internlm-chat_7b_code-only.py | 4 +- .../eval_llama2-internlm_7b_code-only.py | 4 +- .../eval_internlm-chat_7b_only-plugineval.py | 6 +- .../eval_internlm-chat_7b_with-plugineval.py | 6 +- .../summarizers/medium_chat_sft_v052.py | 275 ++++++++++++++++++ 26 files changed, 630 insertions(+), 48 deletions(-) create mode 100644 configs/datasets/humaneval/humaneval_gen_6d1cc2.py create mode 100644 configs/datasets/mbpp/mbpp_gen_caa7ab.py create mode 100644 configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py create mode 100644 configs/sft_cfg/dataset_collections/code_prompt_engineering.py create mode 100644 configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py rename configs/sft_cfg/{7B_eval => plugin_eval}/eval_internlm-chat_7b_only-plugineval.py (89%) rename configs/sft_cfg/{7B_eval => plugin_eval}/eval_internlm-chat_7b_with-plugineval.py (89%) create mode 100644 configs/sft_cfg/summarizers/medium_chat_sft_v052.py diff --git a/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py b/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py index 96f137b61..e19a7d222 100644 --- a/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py +++ b/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py @@ -34,7 +34,7 @@ # If the IP address is not accessible, # follow the instructions below to launch a code evaluate service. # https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb - "http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster + "http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster # INTERNAL_END port=5000 ), diff --git a/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py b/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py index c137bf7c8..b38fb662f 100644 --- a/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py +++ b/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py @@ -34,7 +34,7 @@ # If the IP address is not accessible, # follow the instructions below to launch a code evaluate service. 
# https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb - "http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster + "http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster # INTERNAL_END port=5000 ), diff --git a/configs/datasets/humaneval/humaneval_gen_6d1cc2.py b/configs/datasets/humaneval/humaneval_gen_6d1cc2.py new file mode 100644 index 000000000..9740039ed --- /dev/null +++ b/configs/datasets/humaneval/humaneval_gen_6d1cc2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/configs/datasets/mbpp/mbpp_gen_caa7ab.py b/configs/datasets/mbpp/mbpp_gen_caa7ab.py new file mode 100644 index 000000000..9c24f7ac7 --- /dev/null +++ b/configs/datasets/mbpp/mbpp_gen_caa7ab.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict(role="BOT", prompt="[BEGIN]\n"), + + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py b/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py index f85f181bc..f126bff25 100644 --- a/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py +++ b/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -25,15 +25,17 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model', tokenizer_type='v4', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/plato_123b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/plato_123b_8k_sft.py", # if got w2w3 miss match error, set w2w3_bug=True w2w3_bug=False, meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. 
max_seq_len=8192, - batch_size=8, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=8, num_procs=8)) diff --git a/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py b/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py index 01d7e9779..3d8c49437 100644 --- a/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py +++ b/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py @@ -16,11 +16,13 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/share_data/yanhang/tokenizes/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", model_config="/mnt/petrelfs/wangyudong/opencompass/configs/sft_cfg/1B_eval/1B_model_config.py", max_out_len=100, max_seq_len=2048, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py b/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py index c560e3fc1..a37fd171f 100644 --- a/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py +++ b/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py @@ -5,8 +5,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,13 +24,15 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/newton_20b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/newton_20b_8k_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, - batch_size=8, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=4, num_procs=4)) diff --git a/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py b/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py index 10add7ab8..6bde10d4f 100644 --- a/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py +++ b/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -25,14 +25,15 @@ model_type="LLAMA", tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model', tokenizer_type='v4', - # TODO: support relative path in train_internlm - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/euclid_70b_v2_0_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/euclid_70b_v2_0_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict(num_gpus=4, num_procs=4), ) diff --git a/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py b/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py new file mode 100644 index 000000000..874b572e6 --- /dev/null +++ b/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py @@ -0,0 +1,56 @@ +from mmengine.config import read_base +from opencompass.models.internal import InternLMwithModule +from copy import deepcopy +import os.path as osp + +with read_base(): + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer + from ..clusters.slurm_llmit2 import infer, eval + from ..lark import lark_bot_url + +without_meta_template = dict( + begin="""""", + round=[ + dict(role='HUMAN', begin='user\n', end='\n'), + dict(role='BOT', begin='assistant\n', end='\n', generate=True), + ], + eos_token_id=103166) + +base_dict = dict( + abbr=None, + path=None, + type=InternLMwithModule, + model_type='INTERNLM', + tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', + tokenizer_type='v7', + # TODO: add model config in the shared path + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_16k_sft.py", + meta_template=without_meta_template, + max_out_len=100, + # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
+ max_seq_len=8192, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', + run_cfg=dict( + num_gpus=1, + num_procs=1)) + +models_path = [ + '/mnt/petrelfs/share_data/wangyudong/ckpt/20231206/v0.16_dev2_16k/sft_7b_16k_0.16dev2_rc5/10270', +] + +models = [] + +for model_path in models_path: + tmp_model_dict = deepcopy(base_dict) + if model_path.endswith('/'): + model_path = model_path[:-1] + abbr = osp.split(osp.split(model_path)[0])[-1] + tmp_model_dict['abbr'] = abbr + tmp_model_dict['path'] = model_path + models.append(tmp_model_dict) + +del models_path, model_path, tmp_model_dict, abbr, base_dict diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py index 54db35615..a4dddf049 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,13 +24,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py index 8722ba29f..66419ea87 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.aliyun_llm import infer, eval from ..lark import lark_bot_url @@ -31,6 +31,8 @@ # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py index b01f01096..8daf815ba 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,7 +24,7 @@ model_type='origin', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. @@ -35,7 +35,7 @@ num_procs=1)) models_path = [ - '/mnt/petrelfs/llmit/ckpt/7b-8k/sft_7b_v0_11/4930', + '/mnt/petrelfs/llmit/ckpt/maibao_kaoshi_7_5_ST_8k_v0213rc8/5260', ] models = [] diff --git a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py index e885e4e9a..cfff7da1f 100644 --- a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py +++ b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py @@ -35,13 +35,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py index 6bd8faf67..fa7e67943 100644 --- a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py +++ b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py @@ -32,13 +32,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/clusters/aliyun_llm.py b/configs/sft_cfg/clusters/aliyun_llm.py index 14bc1304e..4d2d32130 100644 --- a/configs/sft_cfg/clusters/aliyun_llm.py +++ b/configs/sft_cfg/clusters/aliyun_llm.py @@ -1,4 +1,4 @@ -from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.partitioners import SizePartitioner, NaivePartitioner, InferTimePartitioner from opencompass.runners import DLCRunner from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask @@ -12,7 +12,23 @@ worker_image='master0:5000/eflops/pytorch:py3.6-torch1.8-cuda11.1-rdma5.2-sshd-ubuntu18.04', ) +# new inference setting: the split strategy may speed up inference infer = dict( + partitioner=dict( + type=InferTimePartitioner, + max_task_time=3600, + strategy='split'), + runner=dict( + type=DLCRunner, + max_num_workers=64, + retry=4, + aliyun_cfg=aliyun_cfg, + task=dict(type=OpenICLInferTask) + ), +) + +# original inference setting +infer_size = dict( partitioner=dict( type=SizePartitioner, max_task_size=5000, # default = 2000 diff --git a/configs/sft_cfg/clusters/slurm_llmit2.py b/configs/sft_cfg/clusters/slurm_llmit2.py index 97226e206..a837e8aa5 100644 --- a/configs/sft_cfg/clusters/slurm_llmit2.py +++ b/configs/sft_cfg/clusters/slurm_llmit2.py @@ -1,10 +1,27 @@ -from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.partitioners import SizePartitioner, NaivePartitioner, InferTimePartitioner from opencompass.runners import SlurmSequentialRunner from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask # if users want to use other partition, use `-p {PARTITION}` # in the command line can change the partition.
+ +# new inference setting: the split strategy may speed up inference infer = dict( + partitioner=dict( + type=InferTimePartitioner, + max_task_time=3600, + strategy='split'), + runner=dict( + type=SlurmSequentialRunner, + max_num_workers=64, + retry=4, + partition='llmit2', + quotatype='auto', + task=dict(type=OpenICLInferTask)), +) + +# original inference setting +infer_size = dict( partitioner=dict( type=SizePartitioner, max_task_size=5000, # default = 2000 @@ -15,7 +32,7 @@ max_num_workers=64, retry=4, partition='llmit2', - quotatype='reserved', + quotatype='auto', task=dict(type=OpenICLInferTask)), ) @@ -24,7 +41,7 @@ runner=dict( type=SlurmSequentialRunner, partition='llmit2', - quotatype='reserved', + quotatype='auto', max_num_workers=128, retry=2, task=dict(type=OpenICLEvalTask)), diff --git a/configs/sft_cfg/dataset_collections/code_core_set.py b/configs/sft_cfg/dataset_collections/code_core_set.py index b7094cc69..d92ecd4a7 100644 --- a/configs/sft_cfg/dataset_collections/code_core_set.py +++ b/configs/sft_cfg/dataset_collections/code_core_set.py @@ -22,8 +22,12 @@ from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from ...datasets.math.math_gen_265cce import math_datasets # 代码 - from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets # 综合推理 from ...datasets.bbh.bbh_gen_5b92b0 import bbh_datasets # 理解 diff --git a/configs/sft_cfg/dataset_collections/code_only_llama_base.py b/configs/sft_cfg/dataset_collections/code_only_llama_base.py index 51b6ab98c..198864e4a 100644 --- a/configs/sft_cfg/dataset_collections/code_only_llama_base.py +++ b/configs/sft_cfg/dataset_collections/code_only_llama_base.py @@ -3,4 +3,9 @@ with read_base(): from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from ...datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets + from ...datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets + from ...datasets.clozeTest_maxmin.clozeTest_maxmin_gen_c205fb import maxmin_datasets + from ...datasets.py150.py150_gen_38b13d import py150_datasets + # humanevalx needs docker during evaluation and is not fully supported yet + # from ...datasets.humanevalx.humanevalx_gen_0af626 import humanevalx_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/sft_cfg/dataset_collections/code_prompt_engineering.py b/configs/sft_cfg/dataset_collections/code_prompt_engineering.py new file mode 100644 index 000000000..edd003c2c --- /dev/null +++ b/configs/sft_cfg/dataset_collections/code_prompt_engineering.py @@ -0,0 +1,11 @@ +from mmengine.config import read_base + +with read_base(): + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git
a/configs/sft_cfg/dataset_collections/code_with_coreset.py b/configs/sft_cfg/dataset_collections/code_with_coreset.py index e2c343669..755cea2a9 100644 --- a/configs/sft_cfg/dataset_collections/code_with_coreset.py +++ b/configs/sft_cfg/dataset_collections/code_with_coreset.py @@ -24,8 +24,12 @@ from ...datasets.math.math_gen_265cce import math_datasets from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # 代码 - from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets - from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets # 理解 # 阅读理解 from ...datasets.obqa.obqa_gen_9069e4 import obqa_datasets diff --git a/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py b/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py new file mode 100644 index 000000000..3bc1a33fa --- /dev/null +++ b/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py @@ -0,0 +1,74 @@ +from mmengine.config import read_base + +with read_base(): + from ...datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from ...datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from ...datasets.agieval.agieval_gen_64afd3 import agieval_datasets + from ...datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets + from ...datasets.bbh.bbh_gen_5b92b0 import bbh_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from ...datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets + from ...datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ...datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets + from ...datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets + from ...datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets + from ...datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets + from ...datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets + from ...datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets + from ...datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets + from ...datasets.FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets + from ...datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets + from ...datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets + from ...datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets + from ...datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets + from ...datasets.lambada.lambada_gen_217e11 import lambada_datasets + from ...datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets + from ...datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets + from ...datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets + from ...datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets + from ...datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c
import CB_datasets + from ...datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets + from ...datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets + from ...datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets + from ...datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets + from ...datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from ...datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets + from ...datasets.race.race_gen_69ee4f import race_datasets + from ...datasets.Xsum.Xsum_gen_31397e import Xsum_datasets + from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from ...datasets.summedits.summedits_gen_315438 import summedits_datasets + from ...datasets.math.math_gen_265cce import math_datasets + from ...datasets.TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets + from ...datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets + from ...datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets + from ...datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets + from ...datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets + from ...datasets.piqa.piqa_gen_1194eb import piqa_datasets + from ...datasets.siqa.siqa_gen_e78df3 import siqa_datasets + from ...datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets + from ...datasets.winogrande.winogrande_gen_a9ede5 import winogrande_datasets + from ...datasets.obqa.obqa_gen_9069e4 import obqa_datasets + from ...datasets.nq.nq_gen_c788f6 import nq_datasets + from ...datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from ...datasets.flores.flores_gen_806ede import flores_datasets + # safety datasets are ignored by default + # from ...datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets + # from ...datasets.civilcomments.civilcomments_clp_a3c5fd import civilcomments_datasets + # from ...datasets.jigsawmultilingual.jigsawmultilingual_clp_fe50d8 import jigsawmultilingual_datasets + # from ...datasets.realtoxicprompts.realtoxicprompts_gen_7605e4 import realtoxicprompts_datasets + # from ...datasets.truthfulqa.truthfulqa_gen_5ddc62 import truthfulqa_datasets + # add subjective QA + from ..subjective_qa.subjectiveqav3_gen import subjectiveqav3_datasets + # add new code evaluation datasets + from ...datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets + from ...datasets.clozeTest_maxmin.clozeTest_maxmin_gen_c205fb import maxmin_datasets + from ...datasets.py150.py150_gen_38b13d import py150_datasets + # humanevalx needs docker during evaluation and is not fully supported yet + # from ...datasets.humanevalx.humanevalx_gen_0af626 import humanevalx_datasets + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py index 4cf2a7037..93a75d20b 100644 --- a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py +++ b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py @@ -24,8 +24,8 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/llama2_7b_8k_sft.py", +
module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/llama2_7b_8k_sft.py", max_out_len=100, max_seq_len=2048, meta_template=meta_template, diff --git a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py index e5727e0a2..14f03d1fc 100644 --- a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py +++ b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py @@ -16,8 +16,8 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/llama2_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/llama2_7b_8k_sft.py", max_out_len=100, max_seq_len=2048, batch_size=16, diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py similarity index 89% rename from configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py rename to configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py index 516e3e65a..774d1a4bf 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py +++ b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py @@ -62,13 +62,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm", - model_config="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm/configs/maibao_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/maibao_7b_8k_sft.py", meta_template=_without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. 
max_seq_len=8192, batch_size=4, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=4, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py similarity index 89% rename from configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py rename to configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py index c2f3c037b..22956a259 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py +++ b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py @@ -62,13 +62,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm", - model_config="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm/configs/maibao_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/maibao_7b_8k_sft.py", meta_template=_without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/summarizers/medium_chat_sft_v052.py b/configs/sft_cfg/summarizers/medium_chat_sft_v052.py new file mode 100644 index 000000000..9a1e98215 --- /dev/null +++ b/configs/sft_cfg/summarizers/medium_chat_sft_v052.py @@ -0,0 +1,275 @@ +from mmengine.config import read_base + +with read_base(): + from ...summarizers.groups.agieval import agieval_summary_groups + from ...summarizers.groups.mmlu import mmlu_summary_groups + from ...summarizers.groups.cmmlu import cmmlu_summary_groups + from ...summarizers.groups.ceval import ceval_summary_groups + from ...summarizers.groups.bbh import bbh_summary_groups + from ...summarizers.groups.GaokaoBench import GaokaoBench_summary_groups + from ...summarizers.groups.flores import flores_summary_groups + from ...summarizers.groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups + from ...summarizers.groups.tydiqa import tydiqa_summary_groups + from ...summarizers.groups.xiezhi import xiezhi_summary_groups + from ...summarizers.groups.ds1000 import ds1000_summary_groups + +summarizer = dict( + dataset_abbrs=[ + # '--------- 考试 Exam ---------', # category + # 'Mixed', # subcategory + "ceval", + 'agieval', + 'mmlu', + 'mmlu_cn', # placeholder + "GaokaoBench", + 'ARC-c', + # '自建考试', # subcategory + 'compass_exam-senior-high-2023', + # '--------- 语言 Language ---------', # category + # '字词释义', # subcategory + 'WiC', + 'summedits', + # '成语习语', # subcategory + 'chid-dev', + # '语义相似度', # subcategory + 'afqmc-dev', + 'bustm-dev', + # '指代消解', # subcategory + 'cluewsc-dev', + 'WSC', + 'winogrande', + # '翻译', # subcategory + 'flores_100', + # '--------- 知识 Knowledge ---------', # category + # '知识问答', # subcategory + 'BoolQ', + 'commonsense_qa', + 'nq', + 'triviaqa', + # '多语种问答', # subcategory + 'tydiqa', # placeholder + # '--------- 推理 Reasoning ---------', # category + # '文本蕴含', # subcategory + 'cmnli', + 'ocnli', + 'ocnli_fc-dev', + 'AX_b', + 'AX_g', + 'CB', + 'RTE', + # '常识推理', # subcategory + 'story_cloze', + 'story_cloze_cn', #
placeholder + 'COPA', + 'ReCoRD', + 'hellaswag', + 'piqa', + 'siqa', + 'strategyqa', + # '数学推理', # subcategory + 'math', + 'math_cn', # placeholder + 'gsm8k', + 'gsm8k_cn', # placeholder + # '定理应用', # subcategory + 'TheoremQA', + # '代码', # subcategory + 'openai_humaneval', + 'mbpp', + # '综合推理', # subcategory + "bbh", + # '--------- 理解 Understanding ---------', # category + # '阅读理解', # subcategory + 'C3', + 'CMRC_dev', + 'DRCD_dev', + 'MultiRC', + 'race-middle', + 'race-high', + 'openbookqa_fact', + # '内容总结', # subcategory + 'csl_dev', + 'lcsts', + 'Xsum', + # '内容分析', # subcategory + 'eprstmt-dev', + 'lambada', + 'tnews-dev', + # '--------- 安全 Safety ---------', # category + # '偏见', # subcategory + 'crows_pairs', + 'crowspairs_cn', # placeholder + 'civil_comments', + # '有害性', # subcategory + 'jigsaw_multilingual', + "allenai_real-toxicity-prompts", + # '真实性', # subcategory + ('truthful_qa', 'truth'), + ('truthful_qa', 'info'), + # '--------- ceval 细节 ---------', + "ceval-stem", + "ceval-social-science", + "ceval-humanities", + "ceval-other", + "ceval-hard", + # category + 'ceval-advanced_mathematics', + 'ceval-college_chemistry', + 'ceval-college_physics', + 'ceval-college_programming', + 'ceval-computer_architecture', + 'ceval-computer_network', + 'ceval-discrete_mathematics', + 'ceval-electrical_engineer', + 'ceval-high_school_biology', + 'ceval-high_school_chemistry', + 'ceval-high_school_mathematics', + 'ceval-high_school_physics', + 'ceval-metrology_engineer', + 'ceval-middle_school_biology', + 'ceval-middle_school_chemistry', + 'ceval-middle_school_mathematics', + 'ceval-middle_school_physics', + 'ceval-operating_system', + 'ceval-probability_and_statistics', + 'ceval-veterinary_medicine', + 'ceval-business_administration', + 'ceval-college_economics', + 'ceval-education_science', + 'ceval-high_school_geography', + 'ceval-high_school_politics', + 'ceval-mao_zedong_thought', + 'ceval-marxism', + 'ceval-middle_school_geography', + 'ceval-middle_school_politics', + 'ceval-teacher_qualification', + 'ceval-art_studies', + 'ceval-chinese_language_and_literature', + 'ceval-high_school_chinese', + 'ceval-high_school_history', + 'ceval-ideological_and_moral_cultivation', + 'ceval-law', + 'ceval-legal_professional', + 'ceval-logic', + 'ceval-middle_school_history', + 'ceval-modern_chinese_history', + 'ceval-professional_tour_guide', + 'ceval-accountant', + 'ceval-basic_medicine', + 'ceval-civil_servant', + 'ceval-clinical_medicine', + 'ceval-environmental_impact_assessment_engineer', + 'ceval-fire_engineer', + 'ceval-physician', + 'ceval-plant_protection', + 'ceval-sports_science', + 'ceval-tax_accountant', + 'ceval-urban_and_rural_planner', + # '--------- agieval 细节 ---------', + 'agieval-chinese', + 'agieval-english', + 'agieval-gaokao', + # category + 'agieval-aqua-rat', + 'agieval-math', + 'agieval-logiqa-en', + 'agieval-logiqa-zh', + 'agieval-jec-qa-kd', + 'agieval-jec-qa-ca', + 'agieval-lsat-ar', + 'agieval-lsat-lr', + 'agieval-lsat-rc', + 'agieval-sat-math', + 'agieval-sat-en', + 'agieval-sat-en-without-passage', + 'agieval-gaokao-chinese', + 'agieval-gaokao-english', + 'agieval-gaokao-geography', + 'agieval-gaokao-history', + 'agieval-gaokao-biology', + 'agieval-gaokao-chemistry', + 'agieval-gaokao-physics', + 'agieval-gaokao-mathqa', + 'agieval-gaokao-mathcloze', + # '--------- mmlu 细节 ---------', + 'mmlu-humanities', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-other', + # category + 'lukaemon_mmlu_abstract_algebra', + 'lukaemon_mmlu_anatomy', + 'lukaemon_mmlu_astronomy', + 
'lukaemon_mmlu_business_ethics', + 'lukaemon_mmlu_clinical_knowledge', + 'lukaemon_mmlu_college_biology', + 'lukaemon_mmlu_college_chemistry', + 'lukaemon_mmlu_college_computer_science', + 'lukaemon_mmlu_college_mathematics', + 'lukaemon_mmlu_college_medicine', + 'lukaemon_mmlu_college_physics', + 'lukaemon_mmlu_computer_security', + 'lukaemon_mmlu_conceptual_physics', + 'lukaemon_mmlu_econometrics', + 'lukaemon_mmlu_electrical_engineering', + 'lukaemon_mmlu_elementary_mathematics', + 'lukaemon_mmlu_formal_logic', + 'lukaemon_mmlu_global_facts', + 'lukaemon_mmlu_high_school_biology', + 'lukaemon_mmlu_high_school_chemistry', + 'lukaemon_mmlu_high_school_computer_science', + 'lukaemon_mmlu_high_school_european_history', + 'lukaemon_mmlu_high_school_geography', + 'lukaemon_mmlu_high_school_government_and_politics', + 'lukaemon_mmlu_high_school_macroeconomics', + 'lukaemon_mmlu_high_school_mathematics', + 'lukaemon_mmlu_high_school_microeconomics', + 'lukaemon_mmlu_high_school_physics', + 'lukaemon_mmlu_high_school_psychology', + 'lukaemon_mmlu_high_school_statistics', + 'lukaemon_mmlu_high_school_us_history', + 'lukaemon_mmlu_high_school_world_history', + 'lukaemon_mmlu_human_aging', + 'lukaemon_mmlu_human_sexuality', + 'lukaemon_mmlu_international_law', + 'lukaemon_mmlu_jurisprudence', + 'lukaemon_mmlu_logical_fallacies', + 'lukaemon_mmlu_machine_learning', + 'lukaemon_mmlu_management', + 'lukaemon_mmlu_marketing', + 'lukaemon_mmlu_medical_genetics', + 'lukaemon_mmlu_miscellaneous', + 'lukaemon_mmlu_moral_disputes', + 'lukaemon_mmlu_moral_scenarios', + 'lukaemon_mmlu_nutrition', + 'lukaemon_mmlu_philosophy', + 'lukaemon_mmlu_prehistory', + 'lukaemon_mmlu_professional_accounting', + 'lukaemon_mmlu_professional_law', + 'lukaemon_mmlu_professional_medicine', + 'lukaemon_mmlu_professional_psychology', + 'lukaemon_mmlu_public_relations', + 'lukaemon_mmlu_security_studies', + 'lukaemon_mmlu_sociology', + 'lukaemon_mmlu_us_foreign_policy', + 'lukaemon_mmlu_virology', + 'lukaemon_mmlu_world_religions', + # new code eval datasets + 'maxmin', + 'py150', + 'ds1000', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib' + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith("_summary_groups")], []), + prompt_db=dict( + database_path='configs/datasets/log.json', + config_dir='configs/datasets', + blacklist='.promptignore'), +)
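Note (not part of the patch): the cluster configs in this patch now export two inference settings, `infer`, built on the new InferTimePartitioner with the 'split' strategy, and `infer_size`, which keeps the original SizePartitioner behaviour. Below is a minimal, hypothetical downstream config sketch for falling back to the size-based setting; the `read_base` import pattern and the `del` cleanup idiom are taken from the configs in this patch, while the rebinding line is an assumption about how a consumer would select between the two exports, not part of the delivered code.

    # hypothetical_eval_cfg.py -- illustration only, not shipped in this patch
    from mmengine.config import read_base

    with read_base():
        # import the original size-based setting instead of the
        # time-based `infer` that the cluster config now exports by default
        from ..clusters.slurm_llmit2 import infer_size, eval
        from ..dataset_collections.medium_chat_sft_v052 import datasets
        from ..summarizers.medium_chat_sft_v052 import summarizer

    # rebind so downstream tasks pick up the size-based partitioner,
    # then drop the alias to keep the resulting config namespace clean
    infer = infer_size
    del infer_size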