diff --git a/.gitignore b/.gitignore
index 9e2b875f1..f64196b45 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,8 +91,12 @@ docs/zh_cn/_build/
 # sft config ignore list
 configs/sft_cfg/*B_*
+configs/sft_cfg/1B/*
 configs/sft_cfg/7B/*
 configs/sft_cfg/20B/*
+configs/sft_cfg/60B/*
+configs/sft_cfg/100B/*
+
 configs/cky/
 # in case llama clone in the opencompass
 llama/
@@ -120,3 +124,6 @@ turbomind/
 *.csv
 *.npy
 *.c
+
+# aliyun
+core.*
diff --git a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py
new file mode 100644
index 000000000..6ee5e6bef
--- /dev/null
+++ b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py
@@ -0,0 +1,42 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GaokaoBenchDataset
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
+
+GaokaoBench_datasets = []
+for folder, prompts in [
+    ("Multiple-choice_Questions", MCQ_prompts),
+    ("Fill-in-the-blank_Questions", FBQ_prompts),
+]:
+    for p in prompts:
+        reader_cfg = {
+            "input_columns": ["question"],
+            "output_column": "answer",
+        }
+        infer_cfg = {
+            "ice_template": {
+                "type": PromptTemplate,
+                "template": {"round": [{"role": "HUMAN", "prompt": p["prefix_prompt"] + "{question}"}]},
+                "ice_token": "</E>",
+            },
+            "retriever": {"type": ZeroRetriever},
+            "inferencer": {"type": GenInferencer, "max_out_len": 1024},
+        }
+        eval_cfg = {
+            "evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
+            "pred_role": "BOT",
+        }
+        dataset = {
+            "type": GaokaoBenchDataset,
+            "abbr": "GaokaoBench_" + p["keyword"],
+            "path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
+            "reader_cfg": reader_cfg,
+            "infer_cfg": infer_cfg,
+            "eval_cfg": eval_cfg,
+        }
+        GaokaoBench_datasets.append(dataset)
diff --git a/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py
new file mode 100644
index 000000000..d89798af7
--- /dev/null
+++ b/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py
@@ -0,0 +1,41 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GaokaoBenchDataset
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
+
+GaokaoBench_datasets = []
+for folder, prompts in [
+    ("Multiple-choice_Questions", MCQ_prompts),
+    ("Fill-in-the-blank_Questions", FBQ_prompts),
+]:
+    for p in prompts:
+        reader_cfg = {
+            "input_columns": ["question"],
+            "output_column": "answer",
+        }
+        infer_cfg = {
+            "prompt_template": {
+                "type": PromptTemplate,
+                "template": p["prefix_prompt"] + "{question}",
+            },
+            "retriever": {"type": ZeroRetriever},
+            "inferencer": {"type": GenInferencer, "max_out_len": 1024},
+        }
+        eval_cfg = {
+            "evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
+            "pred_role": "BOT",
+        }
+        dataset = {
+            "type": GaokaoBenchDataset,
+            "abbr": "GaokaoBench_" + p["keyword"],
+            "path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
+            "reader_cfg": reader_cfg,
+            "infer_cfg": infer_cfg,
+            "eval_cfg": eval_cfg,
+        }
+        GaokaoBench_datasets.append(dataset)
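Editor's note: the two GaokaoBench configs above build one dataset entry per prompt spec, and the evaluator is resolved from the string `"GaokaoBenchEvaluator" + "_" + p["type"]`. A minimal standalone sketch of that cross-product (the two prompt specs here are illustrative stand-ins, not the real `GaokaoBench_prompts` entries):

```python
# Sketch of the config cross-product above; prompt specs are hypothetical.
import os

MCQ_prompts = [{"type": "single_choice", "keyword": "2010-2022_Math_II_MCQs"}]
FBQ_prompts = [{"type": "cloze", "keyword": "2010-2022_Math_I_Fill-in-the-Blank"}]

for folder, prompts in [
    ("Multiple-choice_Questions", MCQ_prompts),
    ("Fill-in-the-blank_Questions", FBQ_prompts),
]:
    for p in prompts:
        abbr = "GaokaoBench_" + p["keyword"]
        evaluator = "GaokaoBenchEvaluator" + "_" + p["type"]  # e.g. GaokaoBenchEvaluator_single_choice
        path = os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json")
        print(abbr, evaluator, path)
```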
"eval_cfg": eval_cfg, + } + GaokaoBench_datasets.append(dataset) diff --git a/configs/datasets/GaokaoBench/GaokaoBench_prompts.py b/configs/datasets/GaokaoBench/GaokaoBench_prompts.py new file mode 100644 index 000000000..c07dcfc21 --- /dev/null +++ b/configs/datasets/GaokaoBench/GaokaoBench_prompts.py @@ -0,0 +1,191 @@ + +MCQ_prompts = [ + { + "type": "single_choice", + "keyword": "2010-2022_Math_II_MCQs", + "prefix_prompt": "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + "comment": "", + }, + { + "type": "single_choice", + "keyword": "2010-2022_Math_I_MCQs", + "prefix_prompt": "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + "comment": "", + }, + { + "type": "single_choice", + "keyword": "2010-2022_History_MCQs", + "prefix_prompt": "请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + }, + { + "type": "single_choice", + "keyword": "2010-2022_Biology_MCQs", + "prefix_prompt": "请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + }, + { + "type": "single_choice", + "keyword": "2010-2022_Political_Science_MCQs", + "prefix_prompt": "请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + }, + { + "type": "multi_choice", + "keyword": "2010-2022_Physics_MCQs", + "prefix_prompt": "请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和之间。\n例如:【答案】 AB \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】... \n请你严格按照上述格式作答。\n", + }, + { + "type": "single_choice", + "keyword": "2010-2022_Chemistry_MCQs", + "prefix_prompt": "请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:", + }, + { + "type": "single_choice", + "keyword": "2010-2013_English_MCQs", + "prefix_prompt": "请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... 
\n请你严格按照上述格式作答。\n题目如下:", + }, + { + "type": "multi_question_choice", + "keyword": "2010-2022_Chinese_Modern_Lit", + "prefix_prompt": "请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:(1)【答案】 A \n(2)【答案】 B \n请你严格按照上述格式作答。\n", + }, + { + "type": "multi_question_choice", + "keyword": "2010-2022_English_Fill_in_Blanks", + "prefix_prompt": "请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:(1)【答案】 A \n(2)【答案】 B \n请你严格按照上述格式作答。\n", + }, + { + "type": "five_out_of_seven", + "keyword": "2012-2022_English_Cloze_Test", + "prefix_prompt": "请回答下面的问题,将符合题意的五个选项的字母写在【答案】和之间,例如“【答案】 A B C D E \n请严格按照上述格式作答。\n", + }, + { + "type": "multi_question_choice", + "keyword": "2010-2022_Geography_MCQs", + "prefix_prompt": "请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:(1)【答案】 A \n(2)【答案】 B \n请你严格按照上述格式作答。\n", + }, + { + "type": "multi_question_choice", + "keyword": "2010-2022_English_Reading_Comp", + "prefix_prompt": "请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:(1)【答案】 A \n(2)【答案】 B \n请你严格按照上述格式作答。\n", + }, + { + "type": "multi_question_choice", + "keyword": "2010-2022_Chinese_Lang_and_Usage_MCQs", + "prefix_prompt": "请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n(1)【解析】 ... \n【答案】 ... \n(2)【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:", + }, +] +FBQ_prompts = [ + { + "type": "cloze", + "keyword": "2010-2022_Math_I_Fill-in-the-Blank", + "prefix_prompt": "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间。\n完整的题目回答格式如下:\n【解析】 ...\n【答案】...\n请你严格按照上述格式作答。\n题目如下:", + "comment": "", + }, + { + "type": "cloze", + "keyword": "2010-2022_Math_II_Fill-in-the-Blank", + "prefix_prompt": "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间。\n完整的题目回答格式如下:\n【解析】 ...\n【答案】...\n请你严格按照上述格式作答。\n题目如下:", + "comment": "", + }, + { + "type": "cloze", + "keyword": "2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation", + "prefix_prompt": "请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和之间,将最终答案写在【答案】和之间。\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n(2)【解析】 ...\n【答案】...\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "cloze", + "keyword": "2014-2022_English_Language_Cloze_Passage", + "prefix_prompt": "请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和之间,将最终答案写在【答案】和之间。\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n(2)【解析】 ...\n【答案】...\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, +] +OEQ_prompts = [ + { + "type": "subjective", + "keyword": "2010-2022_Geography_Open-ended_Questions", + "prefix_prompt": "请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。你的答案请写在【答案】和之间\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chemistry_Open-ended_Questions", + "prefix_prompt": "请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Math_I_Open-ended_Questions", + "prefix_prompt": "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_History_Open-ended_Questions", 
+ "prefix_prompt": "请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Biology_Open-ended_Questions", + "prefix_prompt": "请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\t...\n (2)【解析】 ...\n【答案】...\t...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Math_II_Open-ended_Questions", + "prefix_prompt": "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Physics_Open-ended_Questions", + "prefix_prompt": "请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和之间。请把你的最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。\n完整的题目回答格式如下:(1)【解析】 ...\n【答案】 ...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Political_Science_Open-ended_Questions", + "prefix_prompt": "请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和之间。请把你的答案写在【答案】和之间\n完整的题目回答格式如下:\n(1)【解析】 ...\n【答案】...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "correction", + "keyword": "2012-2022_English_Language_Error_Correction", + "prefix_prompt": "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和之间。\n完整的题目回答格式如下:【答案】 ...\n 请你严格按照上述格式作答。\n题目如下:", + # "prefix_prompt": [ + # "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和之间。\n完整的题目回答格式如下:【答案】 ...\n 请你严格按照上述格式作答。\n题目如下:", + # "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... \n 完整的题目回答格式如下:【答案】(1) ... 
\n (2) ...\n ...(10) ...\n\n请你严格按照上述格式作答。\n短文如下:" + # ], + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chinese_Language_Ancient_Poetry_Reading", + "prefix_prompt": "请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。\n完整的题目回答格式如下:(1)【答案】 ...\n (2)【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chinese_Language_Practical_Text_Reading", + "prefix_prompt": "请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。\n完整的题目回答格式如下:(1)[答案】 ...\n (2)【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chinese_Language_Literary_Text_Reading", + "prefix_prompt": "请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。\n完整的题目回答格式如下:(1)[答案】 ...\n (2)【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chinese_Language_Classical_Chinese_Reading", + "prefix_prompt": "请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 ”\n完整的题目回答格式如下:(1)[答案】 ...\n (2)【答案】...\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, + { + "type": "subjective", + "keyword": "2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions", + "prefix_prompt": "请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和之间。请把你的最终答案写在【答案】和之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A ”。\n完整的题目回答格式如下:(1)【解析】 ...\n【答案】 ...\n (2)【解析】 ...\n【答案】...\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:", + "comment": "", + }, +] diff --git a/configs/datasets/MathBench/mathbench_agent_gen_fbe13b.py b/configs/datasets/MathBench/deprecated_mathbench_agent_gen_fbe13b.py similarity index 100% rename from configs/datasets/MathBench/mathbench_agent_gen_fbe13b.py rename to configs/datasets/MathBench/deprecated_mathbench_agent_gen_fbe13b.py diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py b/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py index 651b82123..952a27934 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py @@ -4,37 +4,36 @@ from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess -TheoremQA_reader_cfg = dict( - input_columns=['Question', 'Answer_type'], - output_column='Answer', - train_split='test') +TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test") -TheoremQA_prompt1 = "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. " \ - "If the Answer type in [bool], the answer needs to be True or False. " \ - "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. " \ - "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. " \ - "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)." \ - "You need to output the answer in your final sentence like 'Therefore, the answer is ...'." -TheoremQA_prompt2 = f"Below is an instruction that describes a task, paired with an input that provides further context. 
" \ - f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n" +TheoremQA_prompt1 = ( + "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. " + "If the Answer type in [bool], the answer needs to be True or False. " + "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. " + "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. " + "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)." + "You need to output the answer in your final sentence like 'Therefore, the answer is ...'." +) +TheoremQA_prompt2 = ( + f"Below is an instruction that describes a task, paired with an input that provides further context. " + f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n" +) TheoremQA_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=TheoremQA_prompt2), + prompt_template=dict(type=PromptTemplate, template=TheoremQA_prompt2), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) -TheoremQA_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type=TheoremQA_postprocess)) +TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess)) TheoremQA_datasets = [ dict( - abbr='TheoremQA', + abbr="TheoremQA", type=TheoremQADataset, path="./data/TheoremQA/test.csv", reader_cfg=TheoremQA_reader_cfg, infer_cfg=TheoremQA_infer_cfg, - eval_cfg=TheoremQA_eval_cfg) + eval_cfg=TheoremQA_eval_cfg, + ) ] diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py b/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py index e5dac6f02..505e6281f 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py @@ -4,10 +4,7 @@ from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess -TheoremQA_reader_cfg = dict( - input_columns=['Question', 'Answer_type'], - output_column='Answer', - train_split='test') +TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test") TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms: 1. a numerical value like 0.1, no symbol and no unit at all. @@ -15,34 +12,33 @@ 3. True/False. 4. an option like (a), (b), (c), (d) """ -TheoremQA_prompt2 = 'Question: {Question}\nLet\'s think step by step.' +TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step." 
 TheoremQA_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
         template=dict(
             begin=[
-                dict(
-                    role='SYSTEM',
-                    fallback_role='HUMAN',
-                    prompt=TheoremQA_prompt1),
+                dict(role="SYSTEM", fallback_role="HUMAN", prompt=TheoremQA_prompt1),
             ],
             round=[
-                dict(role='HUMAN', prompt=TheoremQA_prompt2),
-            ])),
+                dict(role="HUMAN", prompt=TheoremQA_prompt2),
+            ],
+        ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=512))
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
 
-TheoremQA_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_postprocessor=dict(type=TheoremQA_postprocess))
+TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))
 
 TheoremQA_datasets = [
     dict(
-        abbr='TheoremQA',
+        abbr="TheoremQA",
         type=TheoremQADataset,
         path="./data/TheoremQA/test.csv",
         reader_cfg=TheoremQA_reader_cfg,
         infer_cfg=TheoremQA_infer_cfg,
-        eval_cfg=TheoremQA_eval_cfg)
+        eval_cfg=TheoremQA_eval_cfg,
+    )
 ]
diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py b/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
index ef037ee2d..60da5a449 100644
--- a/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
+++ b/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
@@ -4,34 +4,41 @@
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
 
-TheoremQA_reader_cfg = dict(
-    input_columns=['Question', 'Answer_type'],
-    output_column='Answer',
-    train_split='test')
+TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")
+
+TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
+1. a numerical value like 0.1, no symbol and no unit at all.
+2. a list of number like [2, 3, 4].
+3. True/False.
+4. an option like (a), (b), (c), (d)
+"""
+TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
 
 TheoremQA_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template=dict(round=[
-            dict(
-                role='HUMAN',
-                prompt=
-                """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:\n1. a numerical value like 0.1, no symbol and no unit at all.\n2. a list of number like [2, 3, 4].\n3. True/False.\n4. an option like (a), (b), (c), (d)\nQuestion: {Question}\nLet\'s think step by step."""
-            ),
-        ])),
+        template=dict(
+            round=[
+                dict(
+                    role="HUMAN",
+                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
+                ),
+            ]
+        ),
+    ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=512))
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
 
-TheoremQA_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
-    pred_postprocessor=dict(type=TheoremQA_postprocess))
+TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))
 
 TheoremQA_datasets = [
     dict(
-        abbr='TheoremQA',
+        abbr="TheoremQA",
         type=TheoremQADataset,
         path="./data/TheoremQA/test.csv",
         reader_cfg=TheoremQA_reader_cfg,
         infer_cfg=TheoremQA_infer_cfg,
-        eval_cfg=TheoremQA_eval_cfg)
+        eval_cfg=TheoremQA_eval_cfg,
+    )
 ]
diff --git a/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py b/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py
new file mode 100644
index 000000000..7a20656aa
--- /dev/null
+++ b/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
+
+TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")
+
+TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
+1. a numerical value like 0.1, no symbol and no unit at all.
+2. a list of number like [2, 3, 4].
+3. True/False.
+4. an option like (a), (b), (c), (d)
+"""
+TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
+
+TheoremQA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=TheoremQA_prompt1 + TheoremQA_prompt2,
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+# A proper evaluator would need an LLM to extract the answer; this evaluation logic also yields quite a few false negatives (FN).
+TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
+
+TheoremQA_datasets = [
+    dict(
+        abbr="TheoremQA",
+        type=TheoremQADataset,
+        path="./data/TheoremQA/test.csv",
+        reader_cfg=TheoremQA_reader_cfg,
+        infer_cfg=TheoremQA_infer_cfg,
+        eval_cfg=TheoremQA_eval_cfg,
+    )
+]
diff --git a/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py b/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py
new file mode 100644
index 000000000..da17c3144
--- /dev/null
+++ b/configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
+
+TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")
+
+TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
+1. a numerical value like 0.1, no symbol and no unit at all.
+2. a list of number like [2, 3, 4].
+3. True/False.
+4. an option like (a), (b), (c), (d)
+"""
+TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
+
+TheoremQA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role="HUMAN",
+                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+# A proper evaluator would need an LLM to extract the answer; this evaluation logic also yields quite a few false negatives (FN).
+TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
+
+TheoremQA_datasets = [
+    dict(
+        abbr="TheoremQA",
+        type=TheoremQADataset,
+        path="./data/TheoremQA/test.csv",
+        reader_cfg=TheoremQA_reader_cfg,
+        infer_cfg=TheoremQA_infer_cfg,
+        eval_cfg=TheoremQA_eval_cfg,
+    )
+]
diff --git a/configs/datasets/bbh/bbh_gen_0a5495.py b/configs/datasets/bbh/bbh_gen_0a5495.py
new file mode 100644
index 000000000..3d43691e7
--- /dev/null
+++ b/configs/datasets/bbh/bbh_gen_0a5495.py
@@ -0,0 +1,89 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+        _hint = f.read()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=BBHEvaluator_mcq),
+        pred_role="BOT",
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path=f"./data/BBH/data",
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+for _name in bbh_free_form_sets:
+    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+        _hint = f.read()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path=f"./data/BBH/data",
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
diff --git a/configs/datasets/ceval/ceval_ppl_1cd8bf.py b/configs/datasets/ceval/ceval_ppl_1cd8bf.py
new file mode 100644
index 000000000..0c7b6ec2a
--- /dev/null
+++ b/configs/datasets/ceval/ceval_ppl_1cd8bf.py
@@ -0,0 +1,102 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CEvalDataset
+
+ceval_subject_mapping = {
+    'computer_network': ['Computer Network', '计算机网络', 'STEM'],
+    'operating_system': ['Operating System', '操作系统', 'STEM'],
+    'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
+    'college_programming': ['College Programming', '大学编程', 'STEM'],
+    'college_physics': ['College Physics', '大学物理', 'STEM'],
+    'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
+    'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
+    'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
+    'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
+    'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
+    'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
+    'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
+    'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
+    'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
+    'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
+    'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
+    'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
+    'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
+    'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
+    'college_economics': ['College Economics', '大学经济学', 'Social Science'],
+    'business_administration': ['Business Administration', '工商管理', 'Social Science'],
+    'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
+    'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
+    'education_science': ['Education Science', '教育学', 'Social Science'],
+    'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
+    'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
+    'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
+    'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
+    'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
+    'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
+    'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
+    'logic': ['Logic', '逻辑学', 'Humanities'],
+    'law': ['Law', '法学', 'Humanities'],
+    'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
+    'art_studies': ['Art Studies', '艺术学', 'Humanities'],
+    'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
+    'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
+    'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
+    'high_school_history': ['High School History', '高中历史', 'Humanities'],
+    'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
+    'civil_servant': ['Civil Servant', '公务员', 'Other'],
+    'sports_science': ['Sports Science', '体育学', 'Other'],
+    'plant_protection': ['Plant Protection', '植物保护', 'Other'],
+    'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
+    'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
+    'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
+    'accountant': ['Accountant', '注册会计师', 'Other'],
+    'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
+    'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
+    'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
+    'physician': ['Physician', '医师资格', 'Other'],
+}
+ceval_all_sets = list(ceval_subject_mapping.keys())
+
+ceval_datasets = []
+for _split in ["val", "test"]:
+    for _name in ceval_all_sets:
+        ceval_reader_cfg = dict(
+            input_columns=["question", "A", "B", "C", "D"],
+            output_column="answer",
+            train_split="dev",
+            test_split=_split,
+        )
+
+        _ch_name = ceval_subject_mapping[_name][1]
+
+        hint = f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。"
+        question_and_options = "{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
+        ceval_infer_cfg = dict(
+            ice_template=dict(
+                type=PromptTemplate,
+                template={answer: f"{question_and_options}\n答案: {answer}\n" for answer in ["A", "B", "C", "D"]},
+            ),
+            prompt_template=dict(
+                type=PromptTemplate,
+                template={answer: f"{hint}\n</E>{question_and_options}\n答案: {answer}" for answer in ["A", "B", "C", "D"]},
+                ice_token="</E>",
+            ),
+            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+            inferencer=dict(type=PPLInferencer),
+        )
+
+        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+        ceval_datasets.append(
+            dict(
+                type=CEvalDataset,
+                path="./data/ceval/formal_ceval",
+                name=_name,
+                abbr="ceval-" + _name if _split == "val" else "ceval-test-" + _name,
+                reader_cfg=ceval_reader_cfg,
+                infer_cfg=ceval_infer_cfg,
+                eval_cfg=ceval_eval_cfg,
+            ))
diff --git a/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py b/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py
new file mode 100644
index 000000000..6958033d2
--- /dev/null
+++ b/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py
@@ -0,0 +1,117 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CMMLUDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+cmmlu_subject_mapping = {
+    'agronomy': '农学',
+    'anatomy': '解剖学',
+    'ancient_chinese': '古汉语',
+    'arts': '艺术学',
+    'astronomy': '天文学',
+    'business_ethics': '商业伦理',
+    'chinese_civil_service_exam': '中国公务员考试',
+    'chinese_driving_rule': '中国驾驶规则',
+    'chinese_food_culture': '中国饮食文化',
+    'chinese_foreign_policy': '中国外交政策',
+    'chinese_history': '中国历史',
+    'chinese_literature': '中国文学',
+    'chinese_teacher_qualification': '中国教师资格',
+    'clinical_knowledge': '临床知识',
+    'college_actuarial_science': '大学精算学',
+    'college_education': '大学教育学',
+    'college_engineering_hydrology': '大学工程水文学',
+    'college_law': '大学法律',
+    'college_mathematics': '大学数学',
+    'college_medical_statistics': '大学医学统计',
+    'college_medicine': '大学医学',
+    'computer_science': '计算机科学',
+    'computer_security': '计算机安全',
+    'conceptual_physics': '概念物理学',
+    'construction_project_management': '建设工程管理',
+    'economics': '经济学',
+    'education': '教育学',
+    'electrical_engineering': '电气工程',
+    'elementary_chinese': '小学语文',
+    'elementary_commonsense': '小学常识',
+    'elementary_information_and_technology': '小学信息技术',
+    'elementary_mathematics': '初等数学',
+    'ethnology': '民族学',
+    'food_science': '食品科学',
+    'genetics': '遗传学',
+    'global_facts': '全球事实',
+    'high_school_biology': '高中生物',
+    'high_school_chemistry': '高中化学',
+    'high_school_geography': '高中地理',
+    'high_school_mathematics': '高中数学',
+    'high_school_physics': '高中物理学',
+    'high_school_politics': '高中政治',
+    'human_sexuality': '人类性行为',
+    'international_law': '国际法学',
+    'journalism': '新闻学',
+    'jurisprudence': '法理学',
+    'legal_and_moral_basis': '法律与道德基础',
+    'logical': '逻辑学',
+    'machine_learning': '机器学习',
+    'management': '管理学',
+    'marketing': '市场营销',
+    'marxist_theory': '马克思主义理论',
+    'modern_chinese': '现代汉语',
+    'nutrition': '营养学',
+    'philosophy': '哲学',
+    'professional_accounting': '专业会计',
+    'professional_law': '专业法学',
+    'professional_medicine': '专业医学',
+    'professional_psychology': '专业心理学',
+    'public_relations': '公共关系',
+    'security_study': '安全研究',
+    'sociology': '社会学',
+    'sports_science': '体育学',
+    'traditional_chinese_medicine': '中医中药',
+    'virology': '病毒学',
+    'world_history': '世界历史',
+    'world_religions': '世界宗教'
+}
+
+
+cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
+
+cmmlu_datasets = []
+for _name in cmmlu_all_sets:
+    _ch_name = cmmlu_subject_mapping[_name]
+    hint = f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。"
+    question_and_options = "题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
+    cmmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={answer: f"{question_and_options}\n答案是: {answer}\n" for answer in ["A", "B", "C", "D"]},
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template={answer: f"{hint}\n</E>{question_and_options}\n答案是: {answer}" for answer in ["A", "B", "C", "D"]},
+            ice_token="</E>",
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+    cmmlu_datasets.append(
+        dict(
+            type=CMMLUDataset,
+            path="./data/cmmlu/",
+            name=_name,
+            abbr=f"cmmlu-{_name}",
+            reader_cfg=dict(
+                input_columns=["question", "A", "B", "C", "D"],
+                output_column="answer",
+                train_split="dev",
+                test_split='test'),
+            infer_cfg=cmmlu_infer_cfg,
+            eval_cfg=cmmlu_eval_cfg,
+        ))
+
+del _name, _ch_name
diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py
new file mode 100644
index 000000000..fc490e77a
--- /dev/null
+++ b/configs/datasets/collections/base_core.py
@@ -0,0 +1,20 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
+    from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
+    from ..ceval.ceval_ppl_1cd8bf import ceval_datasets
+    from ..GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import GaokaoBench_datasets
+    from ..triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets
+    from ..nq.nq_open_1shot_gen_20a989 import nq_datasets
+    from ..race.race_ppl_abed12 import race_datasets
+    from ..winogrande.winogrande_5shot_ll_9d81d7 import winogrande_datasets
+    from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
+    from ..bbh.bbh_gen_0a5495 import bbh_datasets
+    from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
+    from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
+    from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
+    from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
+    from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/collections/chat_core.py b/configs/datasets/collections/chat_core.py
new file mode 100644
index 000000000..2b64f5b62
--- /dev/null
+++ b/configs/datasets/collections/chat_core.py
@@ -0,0 +1,20 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets
+    from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
+    from ..triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets
+    from ..nq.nq_open_1shot_gen_01cf41 import nq_datasets
+    from ..race.race_gen_69ee4f import race_datasets
+    from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets
+    from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..math.math_evaluatorv2_gen_265cce import math_datasets
+    from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/gsm8k/gsm8k_agent_gen_be1606.py b/configs/datasets/gsm8k/deprecated_gsm8k_agent_gen_be1606.py
similarity index 100%
rename from configs/datasets/gsm8k/gsm8k_agent_gen_be1606.py
rename to configs/datasets/gsm8k/deprecated_gsm8k_agent_gen_be1606.py
diff --git a/configs/datasets/gsm8k/gsm8k_gen_ee684f.py b/configs/datasets/gsm8k/gsm8k_gen_ee684f.py
new file mode 100644
index 000000000..125b229b3
--- /dev/null
+++ b/configs/datasets/gsm8k/gsm8k_gen_ee684f.py
@@ -0,0 +1,88 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='''\
+Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
+Let's think step by step
+Answer:
+Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
+For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
+Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
+However, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.
+They also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.
+And they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.
+So Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.
+They want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75
+They will need to plan to study 4 days to allow for all the time they need.
+The answer is 4
+
+Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?
+Let's think step by step
+Answer:
+Mark's team scores 25 2 pointers, meaning they scored 25*2 = 50 points in 2 pointers.
+His team also scores 8 3 pointers, meaning they scored 8*3 = 24 points in 3 pointers
+They scored 10 free throws, and free throws count as one point so they scored 10*1 = 10 points in free throws.
+All together his team scored 50+24+10 = 84 points
+Mark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2 = 100 points in 2 pointers.
+His opponents scored half his team's number of 3 pointers, meaning they scored 24/2 = 12 points in 3 pointers.
+They also scored half Mark's team's points in free throws, meaning they scored 10/2 = 5 points in free throws.
+All together Mark's opponents scored 100+12+5 = 117 points
+The total score for the game is both team's scores added together, so it is 84+117 = 201 points
+The answer is 201
+
+Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?
+Let's think step by step
+Answer:
+When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24
+The total number of marbles she'll have is 60+24 = 84
+If Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.
+If Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.
+The total number of frisbees she'll have will increase to 30+12 = 42
+Bella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards
+If she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.
+The total number of deck cards she'll have is 10+4 = 14
+Together, Bella will have a total of 14+42+84 = 140 items
+The answer is 140
+
+Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?
+Let's think step by step
+Answer:
+For the first three baskets, the number of apples and oranges in one basket is 9+15 = 24
+In total, together with bananas, the number of fruits in one basket is 24+14 = 38 for the first three baskets.
+Since there are three baskets each having 38 fruits, there are 3*38 = 114 fruits in the first three baskets.
+The number of apples in the fourth basket is 9-2 = 7
+There are also 15-2 = 13 oranges in the fourth basket
+The combined number of oranges and apples in the fourth basket is 13+7 = 20
+The fourth basket also contains 14-2 = 12 bananas.
+In total, the fourth basket has 20+12 = 32 fruits.
+The four baskets together have 32+114 = 146 fruits.
+The answer is 146
+
+Question: {question}
+Let's think step by step
+Answer:
+'''),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+gsm8k_eval_cfg = dict(
+    evaluator=dict(type=Gsm8kEvaluator),
+    pred_postprocessor=dict(type=gsm8k_postprocess),
+    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
+
+gsm8k_datasets = [
+    dict(
+        abbr='gsm8k',
+        type=GSM8KDataset,
+        path='./data/gsm8k',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg)
+]
diff --git a/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py b/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py
new file mode 100644
index 000000000..e80859ea8
--- /dev/null
+++ b/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py
@@ -0,0 +1,58 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import hellaswagDatasetwithICE
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+hellaswag_reader_cfg = dict(
+    input_columns=["ctx", "A", "B", "C", "D"],
+    output_column="label",
+    train_split="train",
+    test_split="val",
+)
+
+hellaswag_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role="HUMAN", prompt=f"{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?"),
+                dict(role="BOT", prompt="{label}\n"),
+            ]
+        ),
+    ),
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(role="HUMAN", prompt="Continue the following text without adding any additional information or formatting:\n"),
+                "</E>",
+            ],
+            round=[
+                dict(role="HUMAN", prompt=f"{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?"),
+                dict(role="BOT", prompt="{label}\n"),
+            ],
+        ),
+        ice_token="</E>",
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
+    inferencer=dict(type=GenInferencer),
+)
+
+hellaswag_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options="ABCD"),
+)
+
+hellaswag_datasets = [
+    dict(
+        abbr="hellaswag",
+        type=hellaswagDatasetwithICE,
+        path="./data/hellaswag/",
+        reader_cfg=hellaswag_reader_cfg,
+        infer_cfg=hellaswag_infer_cfg,
+        eval_cfg=hellaswag_eval_cfg,
+    )
+]
diff --git a/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py b/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py
new file mode 100644
index 000000000..48877dd69
--- /dev/null
+++ b/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import hellaswagDatasetwithICE
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+hellaswag_reader_cfg = dict(
+    input_columns=["ctx", "A", "B", "C", "D"],
+    output_column="label",
+    train_split="train",
+    test_split="val",
+)
+
+hint = "Continue the following text without adding any additional information or formatting:"
+question_and_options = "{ctx}\nA) {A}\nB) {B}\nC) {C}\nD) {D}\nWhat is the right option?"
+
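Editor's note: in the 10-shot hellaswag PPL variant below, each option key renders one complete prompt: the hint, then the ten in-context examples spliced in at the `</E>` token, then the query with that option appended. A small sketch of the rendering under those assumptions (plain string work; the example continuations are invented):

```python
# Hypothetical rendering of one candidate prompt per label, mirroring
# the {answer: f"{hint}\n</E>{question_and_options}\n{answer}"} template below.
hint = "Continue the following text without adding any additional information or formatting:"
q = "{ctx}\nA) {A}\nB) {B}\nC) {C}\nD) {D}\nWhat is the right option?"

ice = q.format(ctx="He cracks the eggs into a bowl", A="and whisks them", B="and eats the shell", C="and waters them", D="and mails them") + "\nA\n"
query = q.format(ctx="She ties her shoes", A="and goes for a run", B="and bakes a cake", C="and swims away", D="and sleeps")

candidates = {ans: f"{hint}\n{ice}{query}\n{ans}" for ans in "ABCD"}
print(candidates["A"])  # the PPL inferencer would score all four and keep the likeliest
```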
+hellaswag_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={answer: f'{question_and_options}\n{answer}\n' for answer in ["A", "B", "C", "D"]}, + ), + prompt_template=dict( + type=PromptTemplate, + template={answer: f"{hint}\n{question_and_options}\n{answer}" for answer in ["A", "B", "C", "D"]}, + ice_token="", + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))), + inferencer=dict(type=PPLInferencer), +) + +hellaswag_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess), +) + +hellaswag_datasets = [ + dict( + abbr="hellaswag", + type=hellaswagDatasetwithICE, + path="./data/hellaswag/", + reader_cfg=hellaswag_reader_cfg, + infer_cfg=hellaswag_infer_cfg, + eval_cfg=hellaswag_eval_cfg, + ) +] diff --git a/configs/datasets/humaneval/humaneval_gen_d2537e.py b/configs/datasets/humaneval/humaneval_gen_d2537e.py new file mode 100644 index 000000000..a416ee322 --- /dev/null +++ b/configs/datasets/humaneval/humaneval_gen_d2537e.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Complete the following python code:\n{prompt}', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/configs/datasets/math/math_agent_evaluatorv2_gen_861b4f.py b/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py similarity index 100% rename from configs/datasets/math/math_agent_evaluatorv2_gen_861b4f.py rename to configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py diff --git a/configs/datasets/math/math_evaluatorv2_gen_265cce.py b/configs/datasets/math/math_evaluatorv2_gen_265cce.py index e3f8ff733..61babee6e 100644 --- a/configs/datasets/math/math_evaluatorv2_gen_265cce.py +++ b/configs/datasets/math/math_evaluatorv2_gen_265cce.py @@ -9,46 +9,14 @@ prompt_template=dict( type=PromptTemplate, template=dict(round=[ - dict( - role="HUMAN", - prompt= - "Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:" - ), - dict( - role="BOT", - prompt= - "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. 
I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:" - ), - dict( - role="BOT", - prompt= - "We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:" - ), - dict( - role="BOT", - prompt= - "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:" - ), - dict( - role="BOT", - prompt= - "If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n" - ), + dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"), + dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"), + dict(role="BOT", prompt="We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"), + dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. 
\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"), + dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"), dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), ])), retriever=dict(type=ZeroRetriever), @@ -56,9 +24,7 @@ # postprocess v2 math_eval_cfg = dict( - evaluator=dict( - type=MATHEvaluator, - version='v2'), + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2)) math_datasets = [ diff --git a/configs/datasets/math/math_evaluatorv2_gen_9d2049.py b/configs/datasets/math/math_evaluatorv2_gen_9d2049.py new file mode 100644 index 000000000..e777e1e30 --- /dev/null +++ b/configs/datasets/math/math_evaluatorv2_gen_9d2049.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="""\ +Problem: +Find the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$. +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$ +Solution: +We have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. 
+ +Problem: +{problem} +Solution:""" + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/math/math_gen_0957ff.py b/configs/datasets/math/math_gen_0957ff.py index 1b8561d23..9065312c5 100644 --- a/configs/datasets/math/math_gen_0957ff.py +++ b/configs/datasets/math/math_gen_0957ff.py @@ -9,46 +9,14 @@ prompt_template=dict( type=PromptTemplate, template=dict(round=[ - dict( - role="HUMAN", - prompt= - "Problem:\nFind the domain of the expression $\\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}\nSolution:" - ), - dict( - role="BOT", - prompt= - "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$\nSolution:" - ), - dict( - role="BOT", - prompt= - "We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:" - ), - dict( - role="BOT", - prompt= - "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{16} \end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:" - ), - dict( - role="BOT", - prompt= - "If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n" - ), + dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}\nSolution:"), + dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. 
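These math configs all route predictions through math_postprocess_v2 before scoring. Its actual implementation lives in opencompass.datasets; the contract the few-shot examples establish, though, is that the answer arrives inside a trailing \boxed{...}. A minimal, hypothetical extractor for that convention has to be brace-aware, since answers like [2,5) arrive as \boxed{[2,5)} and fractions as \boxed{-\frac{2}{3}}:

def extract_boxed(text):
    # Toy stand-in for a boxed-answer postprocessor, not math_postprocess_v2:
    # return the contents of the last \boxed{...}, tracking nested braces.
    start = text.rfind("\\boxed{")
    if start == -1:
        return None
    i = start + len("\\boxed{")
    depth, out = 1, []
    while i < len(text):
        ch = text[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return "".join(out)
        out.append(ch)
        i += 1
    return None

assert extract_boxed("the domain is $\\boxed{[2,5)}$.") == "[2,5)"
assert extract_boxed("$\\boxed{-\\frac{2}{3}}$") == "-\\frac{2}{3}"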
I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$\nSolution:"), + dict(role="BOT", prompt="We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"), + dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{16} \end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:"), + dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n"), dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), ])), retriever=dict(type=ZeroRetriever), diff --git a/configs/datasets/math/math_gen_1ed9c2.py b/configs/datasets/math/math_gen_1ed9c2.py index 18f45aa30..9af377729 100644 --- a/configs/datasets/math/math_gen_1ed9c2.py +++ b/configs/datasets/math/math_gen_1ed9c2.py @@ -9,46 +9,14 @@ prompt_template=dict( type=PromptTemplate, template=dict(round=[ - dict( - role="HUMAN", - prompt= - "Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:" - ), - dict( - role="BOT", - prompt= - "Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $$(-3+2\cdot(2+1))+(-5)\cdot(-4))$ = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:" - ), - dict( - role="BOT", - prompt= - "The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? 
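Before the worked answer that follows: the expectation is (2+3+5)/6 from the primes, 0 from the composites 4 and 6, and -3·(1/6) from rolling 1, i.e. 7/6 ≈ $1.17. Checked with exact arithmetic:

from fractions import Fraction

ev = Fraction(2 + 3 + 5, 6) + 0 * Fraction(2, 6) - 3 * Fraction(1, 6)
assert ev == Fraction(7, 6) and round(float(ev), 2) == 1.17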
Express your answer as a dollar value to the nearest cent.\nSolution:" - ), - dict( - role="BOT", - prompt= - "The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. The composite number 4 or 6 has a 2/6 chance of being rolled, but it results in $0 win. The remaining non-prime and non-composite number is 1 , and it results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:" - ), - dict( - role="BOT", - prompt= - "We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. Then we subtract this vector from $\mathbf{a}$. So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n" - ), + dict(role="HUMAN", prompt="Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:"), + dict(role="BOT", prompt="Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $$(-3+2\cdot(2+1))+(-5)\cdot(-4))$ = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:"), + dict(role="BOT", prompt="The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.\nSolution:"), + dict(role="BOT", prompt="The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. The composite number 4 or 6 has a 2/6 chance of being rolled, but it results in $0 win. The remaining non-prime and non-composite number is 1 , and it results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:"), + dict(role="BOT", prompt="We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. Then we subtract this vector from $\mathbf{a}$. 
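The subtraction performed next is plain componentwise arithmetic, reproducible in one line:

a, b = (-7, 0, 1), (4, 2, -1)
assert tuple(ai - 3 * bi for ai, bi in zip(a, b)) == (-19, -6, 4)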
So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n"), dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), ])), retriever=dict(type=ZeroRetriever), diff --git a/configs/datasets/math/math_gen_265cce.py b/configs/datasets/math/math_gen_265cce.py index 2312fc619..7cd51a98c 100644 --- a/configs/datasets/math/math_gen_265cce.py +++ b/configs/datasets/math/math_gen_265cce.py @@ -9,46 +9,14 @@ prompt_template=dict( type=PromptTemplate, template=dict(round=[ - dict( - role="HUMAN", - prompt= - "Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:" - ), - dict( - role="BOT", - prompt= - "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:" - ), - dict( - role="BOT", - prompt= - "We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:" - ), - dict( - role="BOT", - prompt= - "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n" - ), - dict( - role="HUMAN", - prompt= - "Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:" - ), - dict( - role="BOT", - prompt= - "If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n" - ), + dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"), + dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. 
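Note the doubled braces in this file's prompts, versus the single braces in math_gen_0957ff above. The doubling suggests these strings pass through a Python format-style substitution in which only {problem} is a live placeholder, so every literal LaTeX brace must be written as {{ or }}. A small illustration under that assumption:

template = "Problem:\n{problem}\nThe domain is $\\boxed{{[2,5)}}$."
rendered = template.format(problem="Find the domain of $1/x$.")
# The doubled braces collapse to literal ones; only {problem} is filled in.
assert rendered.endswith("$\\boxed{[2,5)}$.")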
I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"), + dict(role="BOT", prompt="We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"), + dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"), + dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"), dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), ])), retriever=dict(type=ZeroRetriever), diff --git a/configs/datasets/math/math_intern_evaluator_gen_265cce.py b/configs/datasets/math/math_intern_evaluator_gen_265cce.py new file mode 100644 index 000000000..760757cb9 --- /dev/null +++ b/configs/datasets/math/math_intern_evaluator_gen_265cce.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHInternEvaluator, math_intern_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"), + dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"), + dict(role="BOT", prompt="We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. 
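All of these round-based templates share one shape: alternating HUMAN/BOT turns ending in a HUMAN turn that carries the {problem} slot. OpenCompass's real prompt assembly is model-aware (meta-templates, per-role wrapping); the deliberately naive renderer below just substitutes and concatenates, which is enough to eyeball the flat prompt a model ultimately sees:

def render_rounds(rounds, **fields):
    # Toy preview only, not OpenCompass's renderer: fill each turn's
    # placeholders and join the turns in order.
    return "".join(turn["prompt"].format(**fields) for turn in rounds)

rounds = [dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n")]
print(render_rounds(rounds, problem="Compute $1+1$."))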
If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"), + dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"), + dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"), + dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHInternEvaluator), pred_postprocessor=dict(type=math_intern_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/mbpp/mbpp_gen_1e1056.py b/configs/datasets/mbpp/mbpp_gen_1e1056.py index 2add7a61f..d560fc33d 100644 --- a/configs/datasets/mbpp/mbpp_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_gen_1e1056.py @@ -3,62 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset, - abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_gen_5d6316.py b/configs/datasets/mbpp/mbpp_gen_5d6316.py index 53c9e29c5..2224d2fb7 100644 --- a/configs/datasets/mbpp/mbpp_gen_5d6316.py +++ b/configs/datasets/mbpp/mbpp_gen_5d6316.py @@ -3,19 +3,18 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator2 -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") # This prompt is used for WizardLMCode series # You can use other config file for basic 3-shot generation mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt= - """Below is an instruction that describes a task. Write a response that appropriately completes the request. + template=dict( + round=[ + dict( + role="HUMAN", + prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request. 
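mbpp_gen_1e1056 pairs these [BEGIN]/[DONE]-delimited completions with MBPPEvaluator and pred_role="BOT". The real evaluator lives in opencompass.datasets; the contract the prompts set up is simple enough to sketch, with the caveat that this toy version has no sandboxing or timeouts and is not a safe re-implementation:

def toy_mbpp_check(prediction, test_cases):
    # Extract the code between [BEGIN] and [DONE], exec it, then run each
    # assert; the task counts as solved only if every test passes.
    code = prediction.split("[BEGIN]")[-1].split("[DONE]")[0].strip().strip("'")
    env = {}
    try:
        exec(code, env)
        for test in test_cases:
            exec(test, env)
    except Exception:
        return False
    return True

pred = "[BEGIN]\ndef is_not_prime(n):\n    return n % 2 == 0\n[DONE]"
assert toy_mbpp_check(pred, ["assert is_not_prime(10) == True"])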
### Instruction: Create a Python script for this problem: @@ -24,19 +23,24 @@ Test examples: {test_list} -### Response:"""), - ])), +### Response:""", + ), + ] + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset, - abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_gen_6590b0.py b/configs/datasets/mbpp/mbpp_gen_6590b0.py index ba9574db9..c515c7224 100644 --- a/configs/datasets/mbpp/mbpp_gen_6590b0.py +++ b/configs/datasets/mbpp/mbpp_gen_6590b0.py @@ -3,25 +3,26 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n[BEGIN]\n"), + template="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. 
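This WizardLMCode-oriented config swaps in MBPPEvaluator2, presumably because instruction-tuned models tend to answer with Markdown-fenced code rather than [BEGIN]/[DONE] markers. What follows is a guess at that extraction contract, not MBPPEvaluator2's actual code:

import re

def extract_code(prediction):
    # Hypothetical sketch: prefer the first ```python (or bare ```) block,
    # otherwise fall back to the raw prediction text.
    match = re.search(r"```(?:python)?\s*\n(.*?)```", prediction, re.S)
    return match.group(1) if match else prediction

sample = "Here you go:\n```python\nprint('hi')\n```"
assert extract_code(sample) == "print('hi')\n"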
Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n[BEGIN]\n", + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator)) mbpp_datasets = [ dict( type=MBPPDataset, - abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_gen_78c1bc.py b/configs/datasets/mbpp/mbpp_gen_78c1bc.py index f69ba7354..d228ad60c 100644 --- a/configs/datasets/mbpp/mbpp_gen_78c1bc.py +++ b/configs/datasets/mbpp/mbpp_gen_78c1bc.py @@ -3,62 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
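One caveat about the first few-shot answer reused across all of these MBPP configs: tuple(set(a) & set(b)) fixes no element order, since set iteration order is an implementation detail, so expected outputs like (4, 5) are not actually guaranteed by the reference solution. A deterministic variant sorts first:

def similar_elements(test_tup1, test_tup2):
    # Sorting pins down the order that a bare set intersection leaves open.
    return tuple(sorted(set(test_tup1) & set(test_tup2)))

assert similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) == (4, 5)
assert similar_elements((11, 12, 14, 13), (17, 15, 14, 13)) == (13, 14)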
Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset, - abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_gen_caa7ab.py b/configs/datasets/mbpp/mbpp_gen_caa7ab.py index 9c24f7ac7..5a3d3709a 100644 --- a/configs/datasets/mbpp/mbpp_gen_caa7ab.py +++ b/configs/datasets/mbpp/mbpp_gen_caa7ab.py @@ -3,63 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" - ), - dict(role="BOT", prompt="[BEGIN]\n"), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"), + dict(role="BOT", prompt="[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"), + dict(role="BOT", prompt="[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n "), - ], )), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"), + dict(role="BOT", prompt="[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset, - abbr='mbpp', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py index 1d3f66117..5b3366e56 100644 --- a/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py @@ -3,62 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_column') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp_passk', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp_passk", + path="./data/mbpp/mbpp.jsonl", reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py index 53fad6414..4a3f36ea8 100644 --- a/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py @@ -5,63 +5,41 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator -mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_column') +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
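mbpp_passk keeps the prompts but swaps in MBPPDataset_V2 and MBPPPassKEvaluator. The conventional unbiased pass@k estimate over n samples with c passing (Chen et al., 2021) is 1 - C(n-c, k)/C(n, k); whether MBPPPassKEvaluator computes exactly this is defined in opencompass.datasets, but the formula itself takes only a few lines:

from math import comb

def pass_at_k(n, c, k):
    # Unbiased estimator: probability that at least one of k samples drawn
    # without replacement from the n generations is a passing one.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

assert pass_at_k(10, 0, 1) == 0.0
assert abs(pass_at_k(10, 5, 1) - 0.5) < 1e-12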
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") mbpp_datasets = [ dict( type=MBPPDataset_V2, - abbr='mbpp_repeat10', - path='./data/mbpp/mbpp.jsonl', + abbr="mbpp_repeat10", + path="./data/mbpp/mbpp.jsonl", num_repeats=10, reader_cfg=mbpp_reader_cfg, infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg) + eval_cfg=mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py b/configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py index 12634a484..bc8d63268 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py @@ -3,62 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator -sanitized_mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_list_2') +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") sanitized_mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
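mbpp_repeat10 is the sampling side of that evaluation: num_repeats=10 duplicates every task, so a single generation pass yields n = 10 completions per problem, enough to estimate pass@k for any k ≤ 10. For instance, with 3 of 10 samples passing:

from math import comb

n, c = 10, 3  # 10 repeats per task, 3 passed
for k in (1, 5, 10):
    pk = 1.0 if n - c < k else 1.0 - comb(n - c, k) / comb(n, k)
    print(f"pass@{k} = {pk:.3f}")  # 0.300, 0.917, 1.000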
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n",), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n",), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, - abbr='sanitized_mbpp', - path='./data/mbpp/sanitized-mbpp.jsonl', + abbr="sanitized_mbpp", + path="./data/mbpp/sanitized-mbpp.jsonl", reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, - eval_cfg=sanitized_mbpp_eval_cfg) + eval_cfg=sanitized_mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_cb43ef.py b/configs/datasets/mbpp/sanitized_mbpp_gen_cb43ef.py new file mode 100644 index 000000000..2fecc29a7 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_cb43ef.py @@ -0,0 +1,81 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''\ +You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests: + +assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) +assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) +assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) + +[BEGIN] + 'def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return (res)' +[DONE] + + + +You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests: + +assert is_not_prime(2) == False +assert is_not_prime(10) == True +assert is_not_prime(35) == True + +[BEGIN] + 'import math +def is_not_prime(n): + result = False + for i in range(2,int(math.sqrt(n)) + 1): + if n % i == 0: + result = True + return result' +[DONE] + + + +You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
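Both sanitized variants read ./data/mbpp/sanitized-mbpp.jsonl. Loading is SanitizedMBPPDataset's job, but for a quick local look the file is ordinary JSON Lines. The snippet below assumes only the path from the config; it prints the actual field names rather than assuming they match the reader_cfg columns (text, test_list), since the dataset class may rename raw fields on load:

import json

with open("./data/mbpp/sanitized-mbpp.jsonl", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
print(len(rows))
print(sorted(rows[0]))  # inspect the real field names before relying on them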
Your code should pass these tests: + +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] + +[BEGIN] + 'import heapq as hq +def heap_queue_largest(nums,n): + largest_nums = hq.nlargest(n, nums) + return largest_nums' +[DONE] + + + +You are an expert Python programmer, and here is your task: {text} Your code should pass these tests: + +{test_list} + +[BEGIN] +''' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr="sanitized_mbpp", + path="./data/mbpp/sanitized-mbpp.jsonl", + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py index 26250996f..0d30db03b 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py +++ b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py @@ -3,62 +3,40 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator -sanitized_mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_column') +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") sanitized_mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, - abbr='sanitized_mbpp_passk', - path='./data/mbpp/sanitized-mbpp.jsonl', + abbr="sanitized_mbpp_passk", + path="./data/mbpp/sanitized-mbpp.jsonl", reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, - eval_cfg=sanitized_mbpp_eval_cfg) + eval_cfg=sanitized_mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py index a4382c9fe..ccdcaedcf 100644 --- a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py +++ b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py @@ -3,63 +3,41 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator -sanitized_mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='test_column') +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") sanitized_mbpp_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" - ), - dict( - role="BOT", - prompt= - "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " - ), - dict( - role="HUMAN", - prompt= - "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" - ), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. 
Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="BOT", prompt="[BEGIN]\n"), - ], )), + ], + ), + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512), +) sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") sanitized_mbpp_datasets = [ dict( type=SanitizedMBPPDataset, - abbr='sanitized_mbpp_repeat10', - path='./data/mbpp/sanitized-mbpp.jsonl', + abbr="sanitized_mbpp_repeat10", + path="./data/mbpp/sanitized-mbpp.jsonl", num_repeats=10, reader_cfg=sanitized_mbpp_reader_cfg, infer_cfg=sanitized_mbpp_infer_cfg, - eval_cfg=sanitized_mbpp_eval_cfg) + eval_cfg=sanitized_mbpp_eval_cfg, + ) ] diff --git a/configs/datasets/mmlu/mmlu_gen_4d595a.py b/configs/datasets/mmlu/mmlu_gen_4d595a.py index 6b81299fe..dd83cbc48 100644 --- a/configs/datasets/mmlu/mmlu_gen_4d595a.py +++ b/configs/datasets/mmlu/mmlu_gen_4d595a.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MMLUDataset -from opencompass.utils.text_postprocessors import first_capital_postprocess +from opencompass.utils.text_postprocessors import first_option_postprocess # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -95,8 +95,7 @@ round=[ dict( role="HUMAN", - prompt= - f"{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: " + prompt=f"{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: " ), ], ), @@ -108,7 +107,7 @@ mmlu_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type=first_capital_postprocess)) + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) mmlu_datasets.append( dict( diff --git a/configs/datasets/mmlu/mmlu_ppl_ac766d.py b/configs/datasets/mmlu/mmlu_ppl_ac766d.py index f0473eb46..9d824339b 100644 --- a/configs/datasets/mmlu/mmlu_ppl_ac766d.py +++ b/configs/datasets/mmlu/mmlu_ppl_ac766d.py @@ -75,22 +75,15 @@ mmlu_datasets = [] for _name in mmlu_all_sets: _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n' + question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}' mmlu_infer_cfg = dict( ice_template=dict( type=PromptTemplate, - template={ - opt: - f"{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n" - for opt in ["A", "B", "C", "D"] - }, + template={opt: f"{question_overall}\nAnswer: {opt}\n" for opt in ["A", "B", "C", "D"]}, ), prompt_template=dict( type=PromptTemplate, - template={ - opt: - f"{_hint}{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: {opt}" - for opt in ["A", "B", "C", "D"] - }, + template={opt: f"{_hint}</E>{question_overall}\nAnswer: {opt}" for opt in ["A", "B", "C", "D"]}, ice_token="</E>", ), retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), diff --git a/configs/datasets/nq/nq_open_1shot_gen_01cf41.py b/configs/datasets/nq/nq_open_1shot_gen_01cf41.py new file mode 100644 index 000000000..a8f9810a8 --- /dev/null +++ b/configs/datasets/nq/nq_open_1shot_gen_01cf41.py @@ -0,0 +1,61 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NQOpenDataset, NQEvaluator + +nq_datasets = [] +for k in [1]: + nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A:'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + nq_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A: {answer}.\n'), + ] + ), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin="</E>", + round=[ + dict(role='HUMAN', prompt='Q: {question}?'), + dict(role='BOT', prompt='A:'), + ] + ), + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]), + ) + + nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT") + + nq_datasets.append( + dict( + type=NQOpenDataset, + abbr=f'nq_open_{k}shot', + path='./data/nq-open/', + reader_cfg=nq_reader_cfg, + infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) + ) diff --git a/configs/datasets/nq/nq_open_1shot_gen_20a989.py b/configs/datasets/nq/nq_open_1shot_gen_20a989.py new file mode 100644 index 000000000..54aaa99fd --- /dev/null +++ b/configs/datasets/nq/nq_open_1shot_gen_20a989.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NQOpenDataset, NQEvaluator + +nq_datasets = [] +for k in [1]: + nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Q: {question}\nA: ', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + nq_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template='Q: {question}\nA: {answer}.\n', + ), + prompt_template=dict( + type=PromptTemplate, + template='</E>Q: {question}\nA: ', + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]), + ) + + nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT") + + nq_datasets.append( + dict( + type=NQOpenDataset, + abbr=f'nq_open_{k}shot', + path='./data/nq-open/', + reader_cfg=nq_reader_cfg,
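# How this 1-shot prompt gets assembled (an illustrative sketch of the pieces
# above, not extra behavior): FixKRetriever with fix_id_list=list(range(k))
# always picks the same k train-split examples, renders them through
# ice_template, and splices them in at the "</E>" placeholder that ice_token
# names in prompt_template. For k=1 the model sees roughly:
#
#   Q: {first train question}
#   A: {first train answer}.
#
#   Q: {test question}
#   A:
#
# stopping_criteria=["Q:", "\n"] then truncates generation at the next turn
# boundary, so only the short answer string reaches NQEvaluator.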
+ infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) + ) diff --git a/configs/datasets/race/race_ppl_abed12.py b/configs/datasets/race/race_ppl_abed12.py index 5adcec1c1..d64c0cc4f 100644 --- a/configs/datasets/race/race_ppl_abed12.py +++ b/configs/datasets/race/race_ppl_abed12.py @@ -11,19 +11,12 @@ test_split="test" ) +hint = "Read the article, and answer the question by replying A, B, C or D." +question_and_options = "{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}" race_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template={ - 'A': - 'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: A', - 'B': - 'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: B', - 'C': - 'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: C', - 'D': - 'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: D', - }), + template={answer: hint + '\n\n' + question_and_options + '\n\nAnswer: ' + answer for answer in ['A', 'B', 'C', 'D']}), retriever=dict(type=ZeroRetriever), inferencer=dict(type=PPLInferencer)) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py new file mode 100644 index 000000000..f83977d03 --- /dev/null +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_20a989.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator + + +triviaqa_datasets = [] +for k in [1]: + triviaqa_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + triviaqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Q: {question}\nA: ', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + triviaqa_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template='Q: {question}\nA: {answer}.\n', + ), + prompt_template=dict( + type=PromptTemplate, + template='</E>Q: {question}\nA: ', + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]), + ) + + triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT") + + triviaqa_datasets.append( + dict( + type=TriviaQADataset_V2, + abbr=f'triviaqa_wiki_{k}shot', + path='./data/triviaqa', + reader_cfg=triviaqa_reader_cfg, + infer_cfg=triviaqa_infer_cfg, + eval_cfg=triviaqa_eval_cfg) + ) diff --git a/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py new file mode 100644 index 000000000..c8bc858fa --- /dev/null +++ b/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_eaf81e.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator
+ + +triviaqa_datasets = [] +for k in [1]: + triviaqa_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + triviaqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) + ) + else: + triviaqa_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A: {answer}.\n'), + ] + ), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin="</E>", + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ), + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]), + ) + + triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT") + + triviaqa_datasets.append( + dict( + type=TriviaQADataset_V2, + abbr=f'triviaqa_wiki_{k}shot', + path='./data/triviaqa', + reader_cfg=triviaqa_reader_cfg, + infer_cfg=triviaqa_infer_cfg, + eval_cfg=triviaqa_eval_cfg) + ) diff --git a/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py b/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py new file mode 100644 index 000000000..433f259a9 --- /dev/null +++ b/configs/datasets/winogrande/winogrande_5shot_gen_6447e6.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import winograndeDataset_V3 +from opencompass.utils.text_postprocessors import first_option_postprocess + +winogrande_reader_cfg = dict( + input_columns=["opt1", "opt2"], + output_column="answer", + train_split="train_xs", + test_split="dev", +) + +winogrande_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin="</E>", + round=[ + dict(role="HUMAN", prompt="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:"),
+ dict(role="BOT", prompt="{answer}"), + ] + ), + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]), + inferencer=dict(type=GenInferencer), +) + +winogrande_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_option_postprocess, options="AB"), +) + +winogrande_datasets = [ + dict( + abbr="winogrande", + type=winograndeDataset_V3, + path="./data/winogrande", + reader_cfg=winogrande_reader_cfg, + infer_cfg=winogrande_infer_cfg, + eval_cfg=winogrande_eval_cfg, + ) +] diff --git a/configs/datasets/winogrande/winogrande_5shot_ll_9d81d7.py b/configs/datasets/winogrande/winogrande_5shot_ll_9d81d7.py new file mode 100644 index 000000000..53528c34f --- /dev/null +++ b/configs/datasets/winogrande/winogrande_5shot_ll_9d81d7.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import LLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import winograndeDataset_V3 + +winogrande_reader_cfg = dict( + input_columns=['opt1', 'opt2'], + output_column='answer', + train_split="train_xs", + test_split="dev", +) + +question_and_options = "Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}" +winogrande_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={answer: f"{question_and_options}\nAnswer: {answer}\n" for answer in ["A", "B"]}, + ), + prompt_template=dict( + type=PromptTemplate, + template={answer: f"</E>{question_and_options}\nAnswer: {answer}" for answer in ["A", "B"]}, + ice_token="</E>", + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]), + inferencer=dict(type=LLInferencer), +) +winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +winogrande_datasets = [ + dict( + abbr='winogrande', + type=winograndeDataset_V3, + path='./data/winogrande', + reader_cfg=winogrande_reader_cfg, + infer_cfg=winogrande_infer_cfg, + eval_cfg=winogrande_eval_cfg) +] diff --git a/configs/models/gemma/hf_gemma_2b.py b/configs/models/gemma/hf_gemma_2b.py new file mode 100644 index 000000000..ec731c481 --- /dev/null +++ b/configs/models/gemma/hf_gemma_2b.py @@ -0,0 +1,23 @@ +from opencompass.models import HuggingFaceCausalLM + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='gemma-2b-hf', + path="google/gemma-2b", + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/gemma/hf_gemma_2b_it.py b/configs/models/gemma/hf_gemma_2b_it.py new file mode 100644 index 000000000..b87243d7b --- /dev/null +++ b/configs/models/gemma/hf_gemma_2b_it.py @@ -0,0 +1,32 @@ +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'), + dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True), + ], + eos_token_id=107, +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='gemma-2b-it-hf', + path="google/gemma-2b-it", + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + meta_template=_meta_template,
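# What _meta_template renders for a single exchange (a sketch assuming Gemma's
# standard chat markup; its <end_of_turn> control token has id 107 in the Gemma
# tokenizer, matching eos_token_id above so decoding stops at the end of the
# model turn):
#
#   <start_of_turn>user
#   {user prompt}<end_of_turn>
#   <start_of_turn>model
#   {completion}<end_of_turn>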
+ max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/gemma/hf_gemma_7b.py b/configs/models/gemma/hf_gemma_7b.py new file mode 100644 index 000000000..842ea263c --- /dev/null +++ b/configs/models/gemma/hf_gemma_7b.py @@ -0,0 +1,23 @@ +from opencompass.models import HuggingFaceCausalLM + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='gemma-7b-hf', + path="google/gemma-7b", + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/gemma/hf_gemma_7b_it.py b/configs/models/gemma/hf_gemma_7b_it.py new file mode 100644 index 000000000..cc679b2f6 --- /dev/null +++ b/configs/models/gemma/hf_gemma_7b_it.py @@ -0,0 +1,33 @@ +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'), + dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True), + ], + eos_token_id=107, +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='gemma-7b-it-hf', + path="google/gemma-7b-it", + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + meta_template=_meta_template, + min_out_len=1, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] diff --git a/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py b/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py new file mode 100644 index 000000000..e4a93462e --- /dev/null +++ b/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py @@ -0,0 +1,32 @@ +from opencompass.models import HuggingFace + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<用户>'), + dict(role="BOT", begin="<AI>", generate=True), + ], +) + +models = [ + dict( + type=HuggingFace, + abbr='minicpm-2b-dpo-hf', + path='openbmb/MiniCPM-2B-dpo-fp32', + tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32', + model_kwargs=dict( + trust_remote_code=True, + device_map='auto', + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + end_str='<用户>', + ) +] diff --git a/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py b/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py new file mode 100644 index 000000000..6a3007470 --- /dev/null +++ b/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py @@ -0,0 +1,32 @@ +from opencompass.models import HuggingFace + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<用户>'), + dict(role="BOT", begin="<AI>", generate=True), + ], +) + +models = [ + dict( + type=HuggingFace, + abbr='minicpm-2b-sft-hf', + path='openbmb/MiniCPM-2B-sft-fp32', + tokenizer_path='openbmb/MiniCPM-2B-sft-fp32', + model_kwargs=dict( + trust_remote_code=True, + device_map='auto', + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=1, num_procs=1), + end_str='<用户>', + ) +] diff --git a/configs/models/qwen/hf_qwen1_5_14b.py b/configs/models/qwen/hf_qwen1_5_14b.py
index e9d75e4ee..1f6d17097 100644 --- a/configs/models/qwen/hf_qwen1_5_14b.py +++ b/configs/models/qwen/hf_qwen1_5_14b.py @@ -20,6 +20,6 @@ max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=2, num_procs=1), ) ] diff --git a/configs/models/qwen/vllm_qwen1_5_14b_chat.py b/configs/models/qwen/vllm_qwen1_5_14b_chat.py index 960901595..15cd97bbb 100644 --- a/configs/models/qwen/vllm_qwen1_5_14b_chat.py +++ b/configs/models/qwen/vllm_qwen1_5_14b_chat.py @@ -4,8 +4,7 @@ _meta_template = dict( round=[ dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', - generate=True), + dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], eos_token_id=151645, ) diff --git a/configs/summarizers/agent_bench.py b/configs/summarizers/agent_bench.py index 5f1c15160..701292913 100644 --- a/configs/summarizers/agent_bench.py +++ b/configs/summarizers/agent_bench.py @@ -5,101 +5,27 @@ from .groups.plugineval import plugineval_summary_groups agent_summary_groups = [ - dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]), - dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]), - dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]), - dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]), - dict( - name='agent', - subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'], - weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 
'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1} - ) + # dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]), + # dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]), + # dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]), + # dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]), + # dict(name='agent', subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'], weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}), + dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']), + dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']), + dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']), + dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']), + dict(name='agent', subsets=['agent_cn', 'agent_en']), ] summarizer = dict( dataset_abbrs=[ - # 'agent', - # 'math_acc_1_and_fill_in_blank-native', - # 'math_perf_4_and_fill_in_blank-native', - # # '######## MathBench-Agent Accuracy ########', # category - # 'math_acc_1_and_fill_in_blank-agent', - # 
'math_perf_4_and_fill_in_blank-agent', - # # '######## CIBench Template ########', # category - # 'cibench_template:executable', - # 'cibench_template:numeric_correct', - # 'cibench_template:text_score', - # 'cibench_template:vis_sim', - # # '######## CIBench Template Chinese ########', # category - # 'cibench_template_cn:executable', - # 'cibench_template_cn:numeric_correct', - # 'cibench_template_cn:text_score', - # 'cibench_template_cn:vis_sim', - # # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk - # 'cibench_template_wo_nltk:executable', - # 'cibench_template_wo_nltk:numeric_correct', - # 'cibench_template_wo_nltk:vis_sim', - # # '######## CIBench Template Chinese w/o NLTK ########', # category - # 'cibench_template_cn_wo_nltk:executable', - # 'cibench_template_cn_wo_nltk:numeric_correct', - # 'cibench_template_cn_wo_nltk:vis_sim', - # '######## T-Eval ########', # category - ['plugin_eval-p10', 'naive_average'], - ['plugin_eval-p10-instruct_v1', 'format_metric'], - ['plugin_eval-p10-instruct_v1', 'args_em_metric'], - ['plugin_eval-p10-plan_str_v1', 'f1_score'], - ['plugin_eval-p10-plan_json_v1', 'f1_score'], - ['plugin_eval-p10-reason_str_v1', 'thought'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'thought'], - ['plugin_eval-p10-retrieve_str_v1', 'name'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'name'], - ['plugin_eval-p10-understand_str_v1', 'args'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'args'], - ['plugin_eval-p10-review_str_v1', 'review_quality'], - - ['plugin_eval-p10_zh', 'naive_average'], - ['plugin_eval-p10-instruct_v1_zh', 'format_metric'], - ['plugin_eval-p10-instruct_v1_zh', 'args_em_metric'], - ['plugin_eval-p10-plan_str_v1_zh', 'f1_score'], - ['plugin_eval-p10-plan_json_v1_zh', 'f1_score'], - ['plugin_eval-p10-reason_str_v1_zh', 'thought'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'thought'], - ['plugin_eval-p10-retrieve_str_v1_zh', 'name'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'name'], - ['plugin_eval-p10-understand_str_v1_zh', 'args'], - ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'], - ['plugin_eval-p10-review_str_v1_zh', 'review_quality'], - - # '######## MUS-T-Eval ########', # category - ['plugin_eval-mus-p10', 'naive_average'], - ['plugin_eval-mus-p10-instruct_v1', 'format_metric'], - ['plugin_eval-mus-p10-instruct_v1', 'args_em_metric'], - ['plugin_eval-mus-p10-plan_str_v1', 'f1_score'], - ['plugin_eval-mus-p10-plan_json_v1', 'f1_score'], - ['plugin_eval-mus-p10-reason_str_v1', 'thought'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'thought'], - ['plugin_eval-mus-p10-retrieve_str_v1', 'name'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'name'], - ['plugin_eval-mus-p10-understand_str_v1', 'args'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'args'], - ['plugin_eval-mus-p10-review_str_v1', 'review_quality'], - - ['plugin_eval-mus-p10_zh', 'naive_average'], - ['plugin_eval-mus-p10-instruct_v1_zh', 'format_metric'], - ['plugin_eval-mus-p10-instruct_v1_zh', 'args_em_metric'], - ['plugin_eval-mus-p10-plan_str_v1_zh', 'f1_score'], - ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score'], - ['plugin_eval-mus-p10-reason_str_v1_zh', 'thought'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'thought'], - ['plugin_eval-mus-p10-retrieve_str_v1_zh', 'name'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'name'], - 
['plugin_eval-mus-p10-understand_str_v1_zh', 'args'], - ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'args'], - ['plugin_eval-mus-p10-review_str_v1_zh', 'review_quality'], - - # ['plugin_eval-p10', 'naive_average'], - # ['plugin_eval-mus-p10', 'naive_average'], - # ['plugin_eval-p10_zh', 'naive_average'], - # ['plugin_eval-mus-p10_zh', 'naive_average'], + 'agent', + 'agent_cn', + 'agent_en', + 'cibench_template_cn', + 'cibench_template', + 'plugin_eval-mus-p10_one_review_zh', + 'plugin_eval-mus-p10_one_review', ], summary_groups=sum( [v for k, v in locals().items() if k.endswith("_summary_groups")], []) diff --git a/configs/summarizers/code_passk.py b/configs/summarizers/code_passk.py index b90d892ef..6852ee0e2 100644 --- a/configs/summarizers/code_passk.py +++ b/configs/summarizers/code_passk.py @@ -21,30 +21,22 @@ {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]}, # real add {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']}, - {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']} + # {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']} + {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']}, + {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']}, + {'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']}, ] summarizer = dict( dataset_abbrs=[ 'code', - 'humaneval_pass@1(greedy)', - 'humaneval_pass@10', + 'code_cn', + 'code_en', 'humaneval_cn_pass@1(greedy)', - 'humaneval_cn_pass@10', 'humaneval_plus_pass@1(greedy)', - 'humaneval_plus_pass@10', - 'mbpp_pass@1(greedy)', - 'mbpp_pass@10', 'mbpp_cn_pass@1(greedy)', - 'mbpp_cn_pass@10', 'sanitized_mbpp_pass@1(greedy)', - 'sanitized_mbpp_pass@10', 'humanevalx', - 'humanevalx-python', - 'humanevalx-cpp', - 'humanevalx-go', - 'humanevalx-java', - 'humanevalx-js', ], summary_groups=sum( [v for k, v in locals().items() if k.endswith("_summary_groups")], []) diff --git a/configs/summarizers/compass_knowledge.py b/configs/summarizers/compass_knowledge.py index c23bddee9..dd46e8d86 100644 --- a/configs/summarizers/compass_knowledge.py +++ b/configs/summarizers/compass_knowledge.py @@ -15,21 +15,13 @@ 'compassbench_v1_knowledge-mixed-cloze_en' summarizer = dict( dataset_abbrs=[ - 'knowledge_acc_1_and_cloze', - ['knowledge_cn', 'acc_1'], - ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'], - ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'], - ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'], - ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'], - 'compassbench_v1_knowledge-mixed-cloze_en', - 'knowledge_perf_4_and_cloze', ['knowledge_cn', 'perf_4'], + 'compassbench_v1_knowledge-mixed-cloze_en', ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'], ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'], ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'], 
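# How to read these metric pairs (a sketch of the CircularEval convention
# behind the *_circular abbrs): each multiple-choice item is asked four times
# with its options rotated; 'acc_1' scores only the original ordering, while
# the stricter 'perf_4' counts an item correct only when all four rotations
# are answered correctly, which is why perf_4 is the number kept here.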
['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'], - 'compassbench_v1_knowledge-mixed-cloze_en', ], summary_groups=compassbench_v1_knowledge_groups ) diff --git a/configs/summarizers/compass_math.py b/configs/summarizers/compass_math.py index f8be85550..4ebc22b52 100644 --- a/configs/summarizers/compass_math.py +++ b/configs/summarizers/compass_math.py @@ -1,36 +1,18 @@ # This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen` compassbench_v1_math_groups = [ - {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [ - ['compassbench_v1_math-high-single_choice_cn', 'acc_1'], - ['compassbench_v1_math-high-single_choice_en', 'acc_1'], - ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], - ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], - ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], - ['compassbench_v1_math-primary-cloze_en', 'accuracy'], - ]}, - {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [ - ['compassbench_v1_math-high-single_choice_cn', 'perf_4'], - ['compassbench_v1_math-high-single_choice_en', 'perf_4'], - ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], - ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], - ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], - ['compassbench_v1_math-primary-cloze_en', 'accuracy'], - ]}, + {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]}, + {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]}, ] summarizer = dict( dataset_abbrs=[ - 'math_acc_1_and_fill_in_blank', - ['compassbench_v1_math-high-single_choice_cn', 'acc_1'], - ['compassbench_v1_math-high-single_choice_en', 'acc_1'], - ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], - ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], - ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], - ['compassbench_v1_math-primary-cloze_en', 'accuracy'], - 'math_perf_4_and_fill_in_blank', + 'math_perf_4_and_fill_in_blank_cn', + 'math_perf_4_and_fill_in_blank_en', ['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], diff --git a/configs/summarizers/compassbench_v1_language.py b/configs/summarizers/compassbench_v1_language.py index 167b9d050..2d619e31e 100644 --- a/configs/summarizers/compassbench_v1_language.py +++ 
b/configs/summarizers/compassbench_v1_language.py @@ -34,37 +34,18 @@ summarizer = dict( dataset_abbrs=[ - 'language_acc_1_and_non_mcq', - 'language_en_acc_1_and_non_mcq', - 'language_zh_acc_1_and_non_mcq', - # ['information_retrieval_en', 'score'], - # ['information_retrieval_zh', 'score'], - ['intention_recognition_en_circular', 'acc_origin'], - ['intention_recognition_zh_circular', 'acc_origin'], - ['sentiment_analysis_en_circular', 'acc_origin'], - ['sentiment_analysis_zh_circular', 'acc_origin'], - ['translation', 'score'], - ['content_critic_en_circular', 'acc_origin'], - ['content_critic_zh_circular', 'acc_origin'], - ['content_summarization_en', 'rouge1'], - ['content_summarization_zh', 'rouge1'], - ['traditional_cultural_understanding_zh_circular', 'acc_origin'], - ['chinese_semantic_understanding_zh_circular', 'acc_origin'], - 'language_perf_4_and_non_mcq', - 'language_en_perf_4_and_non_mcq', 'language_zh_perf_4_and_non_mcq', - # ['information_retrieval_en', 'score'], - # ['information_retrieval_zh', 'score'], - ['intention_recognition_en_circular', 'perf_circular'], + 'language_en_perf_4_and_non_mcq', ['intention_recognition_zh_circular', 'perf_circular'], - ['sentiment_analysis_en_circular', 'perf_circular'], + ['intention_recognition_en_circular', 'perf_circular'], ['sentiment_analysis_zh_circular', 'perf_circular'], + ['sentiment_analysis_en_circular', 'perf_circular'], ['translation', 'score'], - ['content_critic_en_circular', 'perf_circular'], ['content_critic_zh_circular', 'perf_circular'], - ['content_summarization_en', 'rouge1'], + ['content_critic_en_circular', 'perf_circular'], ['content_summarization_zh', 'rouge1'], + ['content_summarization_en', 'rouge1'], ['traditional_cultural_understanding_zh_circular', 'perf_circular'], ['chinese_semantic_understanding_zh_circular', 'perf_circular'], ], diff --git a/configs/summarizers/compassbench_v1_reason.py b/configs/summarizers/compassbench_v1_reason.py index 23576626e..ae855f70e 100644 --- a/configs/summarizers/compassbench_v1_reason.py +++ b/configs/summarizers/compassbench_v1_reason.py @@ -12,36 +12,9 @@ summarizer = dict( dataset_abbrs=[ - ['reasonbench', 'acc_origin'], - ['reasonbench_cn_circular', 'acc_origin'], - ['reasonbench_en_circular', 'acc_origin'], - - ['reasonbench_cn_commonsense_circular', 'acc_origin'], - ['reasonbench_cn_abductive_circular', 'acc_origin'], - ['reasonbench_cn_deductive_circular', 'acc_origin'], - ['reasonbench_cn_inductive_circular', 'acc_origin'], - ['reasonbench_en_commonsense_circular', 'acc_origin'], - ['reasonbench_en_abductive_circular', 'acc_origin'], - ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'], - ['reasonbench_en_inductive_circular', 'acc_origin'], - - ['reasonbench_cn_commonsense_circular', 'acc_origin'], - ['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'], - ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'], - ['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'], - ['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'], - ['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'], - ['reasonbench_en_commonsense_circular', 'acc_origin'], - ['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'], - ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'], - ['reasonbench_en_inductive_deer_circular', 'acc_origin'], - ['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'], - - ['reasonbench', 'perf_circular'], ['reasonbench_cn_circular', 
'perf_circular'], ['reasonbench_en_circular', 'perf_circular'], - ['reasonbench_cn_commonsense_circular', 'perf_circular'], ['reasonbench_cn_abductive_circular', 'perf_circular'], ['reasonbench_cn_deductive_circular', 'perf_circular'], @@ -50,18 +23,6 @@ ['reasonbench_en_abductive_circular', 'perf_circular'], ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], ['reasonbench_en_inductive_circular', 'perf_circular'], - - ['reasonbench_cn_commonsense_circular', 'perf_circular'], - ['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'], - ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'], - ['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'], - ['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'], - ['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'], - ['reasonbench_en_commonsense_circular', 'perf_circular'], - ['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'], - ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], - ['reasonbench_en_inductive_deer_circular', 'perf_circular'], - ['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'], ], summary_groups=compassbench_v1_reason_groups, ) diff --git a/configs/summarizers/groups/plugineval.py b/configs/summarizers/groups/plugineval.py index 39ecf47b9..c94146962 100644 --- a/configs/summarizers/groups/plugineval.py +++ b/configs/summarizers/groups/plugineval.py @@ -39,6 +39,22 @@ ['plugin_eval-review_str_v1', 'review_quality'], ], }, + { + 'name': 'plugin_eval_one_review', + 'subsets': [ + ['plugin_eval-instruct_v1', 'format_metric'], + ['plugin_eval-instruct_v1', 'args_em_metric'], + ['plugin_eval-plan_str_v1', 'f1_score'], + ['plugin_eval-plan_json_v1', 'f1_score'], + ['plugin_eval-reason_str_v1', 'thought'], + ['plugin_eval-reason_retrieve_understand_json_v1', 'thought'], + ['plugin_eval-retrieve_str_v1', 'name'], + ['plugin_eval-reason_retrieve_understand_json_v1', 'name'], + ['plugin_eval-understand_str_v1', 'args'], + ['plugin_eval-reason_retrieve_understand_json_v1', 'args'], + ['plugin_eval-review_str_v1', 'review_quality'], + ] + }, { 'name': 'plugin_eval', 'subsets': [ @@ -53,7 +69,6 @@ ['plugin_eval-understand_str_v1', 'args'], ['plugin_eval-reason_retrieve_understand_json_v1', 'args'], ['plugin_eval-review_str_v1', 'review_quality'], - ['copy_plugin_eval-review_str_v1', 'naive_average'], # a hack for review * 2 ] }, ] diff --git a/opencompass/datasets/IFEval/instructions_util.py b/opencompass/datasets/IFEval/instructions_util.py index d59b8510f..fbccee809 100644 --- a/opencompass/datasets/IFEval/instructions_util.py +++ b/opencompass/datasets/IFEval/instructions_util.py @@ -20,16 +20,12 @@ import random import re -try: - import immutabledict -except ImportError: - immutabledict = None import nltk WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 
'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower', 'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain', 'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war', 'figure', 'fee', 'spray', 'user', 'dirt', 'shot', 'task', 'stick', 'friend', 'software', 'promotion', 'interaction', 'surround', 'block', 
'purpose', 'practice', 'conflict', 'routine', 'requirement', 'bonus', 'hole', 'state', 'junior', 'sweet', 'catch', 'tear', 'fold', 'wall', 'editor', 'life', 'position', 'pound', 'respect', 'bathroom', 'coat', 'script', 'job', 'teach', 'birth', 'view', 'resolve', 'theme', 'employee', 'doubt', 'market', 'education', 'serve', 'recover', 'tone', 'harm', 'miss', 'union', 'understanding', 'cow', 'river', 'association', 'concept', 'training', 'recipe', 'relationship', 'reserve', 'depression', 'proof', 'hair', 'revenue', 'independent', 'lift', 'assignment', 'temporary', 'amount', 'loss', 'edge', 'track', 'check', 'rope', 'estimate', 'pollution', 'stable', 'message', 'delivery', 'perspective', 'mirror', 'assistant', 'representative', 'witness', 'nature', 'judge', 'fruit', 'tip', 'devil', 'town', 'emergency', 'upper', 'drop', 'stay', 'human', 'neck', 'speaker', 'network', 'sing', 'resist', 'league', 'trip', 'signature', 'lawyer', 'importance', 'gas', 'choice', 'engineer', 'success', 'part', 'external', 'worker', 'simple', 'quarter', 'student', 'heart', 'pass', 'spite', 'shift', 'rough', 'lady', 'grass', 'community', 'garage', 'youth', 'standard', 'skirt', 'promise', 'blind', 'television', 'disease', 'commission', 'positive', 'energy', 'calm', 'presence', 'tune', 'basis', 'preference', 'head', 'common', 'cut', 'somewhere', 'presentation', 'current', 'thought', 'revolution', 'effort', 'master', 'implement', 'republic', 'floor', 'principle', 'stranger', 'shoulder', 'grade', 'button', 'tennis', 'police', 'collection', 'account', 'register', 'glove', 'divide', 'professor', 'chair', 'priority', 'combine', 'peace', 'extension', 'maybe', 'evening', 'frame', 'sister', 'wave', 'code', 'application', 'mouse', 'match', 'counter', 'bottle', 'half', 'cheek', 'resolution', 'back', 'knowledge', 'make', 'discussion', 'screw', 'length', 'accident', 'battle', 'dress', 'knee', 'log', 'package', 'it', 'turn', 'hearing', 'newspaper', 'layer', 'wealth', 'profile', 'imagination', 'answer', 'weekend', 'teacher', 'appearance', 'meet', 'bike', 'rise', 'belt', 'crash', 'bowl', 'equivalent', 'support', 'image', 'poem', 'risk', 'excitement', 'remote', 'secretary', 'public', 'produce', 'plane', 'display', 'money', 'sand', 'situation', 'punch', 'customer', 'title', 'shake', 'mortgage', 'option', 'number', 'pop', 'window', 'extent', 'nothing', 'experience', 'opinion', 'departure', 'dance', 'indication', 'boy', 'material', 'band', 'leader', 'sun', 'beautiful', 'muscle', 'farmer', 'variety', 'fat', 'handle', 'director', 'opportunity', 'calendar', 'outside', 'pace', 'bath', 'fish', 'consequence', 'put', 'owner', 'go', 'doctor', 'information', 'share', 'hurt', 'protection', 'career', 'finance', 'force', 'golf', 'garbage', 'aspect', 'kid', 'food', 'boot', 'milk', 'respond', 'objective', 'reality', 'raw', 'ring', 'mall', 'one', 'impact', 'area', 'news', 'international', 'series', 'impress', 'mother', 'shelter', 'strike', 'loan', 'month', 'seat', 'anything', 'entertainment', 'familiar', 'clue', 'year', 'glad', 'supermarket', 'natural', 'god', 'cost', 'conversation', 'tie', 'ruin', 'comfort', 'earth', 'storm', 'percentage', 'assistance', 'budget', 'strength', 'beginning', 'sleep', 'other', 'young', 'unit', 'fill', 'store', 'desire', 'hide', 'value', 'cup', 'maintenance', 'nurse', 'function', 'tower', 'role', 'class', 'camera', 'database', 'panic', 'nation', 'basket', 'ice', 'art', 'spirit', 'chart', 'exchange', 'feedback', 'statement', 'reputation', 'search', 'hunt', 'exercise', 'nasty', 'notice', 'male', 'yard', 'annual', 'collar', 
'date', 'platform', 'plant', 'fortune', 'passion', 'friendship', 'spread', 'cancer', 'ticket', 'attitude', 'island', 'active', 'object', 'service', 'buyer', 'bite', 'card', 'face', 'steak', 'proposal', 'patient', 'heat', 'rule', 'resident', 'broad', 'politics', 'west', 'knife', 'expert', 'girl', 'design', 'salt', 'baseball', 'grab', 'inspection', 'cousin', 'couple', 'magazine', 'cook', 'dependent', 'security', 'chicken', 'version', 'currency', 'ladder', 'scheme', 'kitchen', 'employment', 'local', 'attention', 'manager', 'fact', 'cover', 'sad', 'guard', 'relative', 'county', 'rate', 'lunch', 'program', 'initiative', 'gear', 'bridge', 'breast', 'talk', 'dish', 'guarantee', 'beer', 'vehicle', 'reception', 'woman', 'substance', 'copy', 'lecture', 'advantage', 'park', 'cold', 'death', 'mix', 'hold', 'scale', 'tomorrow', 'blood', 'request', 'green', 'cookie', 'church', 'strip', 'forever', 'beyond', 'debt', 'tackle', 'wash', 'following', 'feel', 'maximum', 'sector', 'sea', 'property', 'economics', 'menu', 'bench', 'try', 'language', 'start', 'call', 'solid', 'address', 'income', 'foot', 'senior', 'honey', 'few', 'mixture', 'cash', 'grocery', 'link', 'map', 'form', 'factor', 'pot', 'model', 'writer', 'farm', 'winter', 'skill', 'anywhere', 'birthday', 'policy', 'release', 'husband', 'lab', 'hurry', 'mail', 'equipment', 'sink', 'pair', 'driver', 'consideration', 'leather', 'skin', 'blue', 'boat', 'sale', 'brick', 'two', 'feed', 'square', 'dot', 'rush', 'dream', 'location', 'afternoon', 'manufacturer', 'control', 'occasion', 'trouble', 'introduction', 'advice', 'bet', 'eat', 'kill', 'category', 'manner', 'office', 'estate', 'pride', 'awareness', 'slip', 'crack', 'client', 'nail', 'shoot', 'membership', 'soft', 'anybody', 'web', 'official', 'individual', 'pizza', 'interest', 'bag', 'spell', 'profession', 'queen', 'deal', 'resource', 'ship', 'guy', 'chocolate', 'joint', 'formal', 'upstairs', 'car', 'resort', 'abroad', 'dealer', 'associate', 'finger', 'surgery', 'comment', 'team', 'detail', 'crazy', 'path', 'tale', 'initial', 'arm', 'radio', 'demand', 'single', 'draw', 'yellow', 'contest', 'piece', 'quote', 'pull', 'commercial', 'shirt', 'contribution', 'cream', 'channel', 'suit', 'discipline', 'instruction', 'concert', 'speech', 'low', 'effective', 'hang', 'scratch', 'industry', 'breakfast', 'lay', 'join', 'metal', 'bedroom', 'minute', 'product', 'rest', 'temperature', 'many', 'give', 'argument', 'print', 'purple', 'laugh', 'health', 'credit', 'investment', 'sell', 'setting', 'lesson', 'egg', 'middle', 'marriage', 'level', 'evidence', 'phrase', 'love', 'self', 'benefit', 'guidance', 'affect', 'you', 'dad', 'anxiety', 'special', 'boyfriend', 'test', 'blank', 'payment', 'soup', 'obligation', 'reply', 'smile', 'deep', 'complaint', 'addition', 'review', 'box', 'towel', 'minor', 'fun', 'soil', 'issue', 'cigarette', 'internet', 'gain', 'tell', 'entry', 'spare', 'incident', 'family', 'refuse', 'branch', 'can', 'pen', 'grandfather', 'constant', 'tank', 'uncle', 'climate', 'ground', 'volume', 'communication', 'kind', 'poet', 'child', 'screen', 'mine', 'quit', 'gene', 'lack', 'charity', 'memory', 'tooth', 'fear', 'mention', 'marketing', 'reveal', 'reason', 'court', 'season', 'freedom', 'land', 'sport', 'audience', 'classroom', 'law', 'hook', 'win', 'carry', 'eye', 'smell', 'distribution', 'research', 'country', 'dare', 'hope', 'whereas', 'stretch', 'library', 'if', 'delay', 'college', 'plastic', 'book', 'present', 'use', 'worry', 'champion', 'goal', 'economy', 'march', 'election', 'reflection', 'midnight', 
'slide', 'inflation', 'action', 'challenge', 'guitar', 'coast', 'apple', 'campaign', 'field', 'jacket', 'sense', 'way', 'visual', 'remove', 'weather', 'trash', 'cable', 'regret', 'buddy', 'beach', 'historian', 'courage', 'sympathy', 'truck', 'tension', 'permit', 'nose', 'bed', 'son', 'person', 'base', 'meat', 'usual', 'air', 'meeting', 'worth', 'game', 'independence', 'physical', 'brief', 'play', 'raise', 'board', 'she', 'key', 'writing', 'pick', 'command', 'party', 'yesterday', 'spring', 'candidate', 'physics', 'university', 'concern', 'development', 'change', 'string', 'target', 'instance', 'room', 'bitter', 'bird', 'football', 'normal', 'split', 'impression', 'wood', 'long', 'meaning', 'stock', 'cap', 'leadership', 'media', 'ambition', 'fishing', 'essay', 'salad', 'repair', 'today', 'designer', 'night', 'bank', 'drawing', 'inevitable', 'phase', 'vast', 'chip', 'anger', 'switch', 'cry', 'twist', 'personality', 'attempt', 'storage', 'being', 'preparation', 'bat', 'selection', 'white', 'technology', 'contract', 'side', 'section', 'station', 'till', 'structure', 'tongue', 'taste', 'truth', 'difficulty', 'group', 'limit', 'main', 'move', 'feeling', 'light', 'example', 'mission', 'might', 'wait', 'wheel', 'shop', 'host', 'classic', 'alternative', 'cause', 'agent', 'consist', 'table', 'airline', 'text', 'pool', 'craft', 'range', 'fuel', 'tool', 'partner', 'load', 'entrance', 'deposit', 'hate', 'article', 'video', 'summer', 'feature', 'extreme', 'mobile', 'hospital', 'flight', 'fall', 'pension', 'piano', 'fail', 'result', 'rub', 'gap', 'system', 'report', 'suck', 'ordinary', 'wind', 'nerve', 'ask', 'shine', 'note', 'line', 'mom', 'perception', 'brother', 'reference', 'bend', 'charge', 'treat', 'trick', 'term', 'homework', 'bake', 'bid', 'status', 'project', 'strategy', 'orange', 'let', 'enthusiasm', 'parent', 'concentrate', 'device', 'travel', 'poetry', 'business', 'society', 'kiss', 'end', 'vegetable', 'employ', 'schedule', 'hour', 'brave', 'focus', 'process', 'movie', 'illegal', 'general', 'coffee', 'ad', 'highway', 'chemistry', 'psychology', 'hire', 'bell', 'conference', 'relief', 'show', 'neat', 'funny', 'weight', 'quality', 'club', 'daughter', 'zone', 'touch', 'tonight', 'shock', 'burn', 'excuse', 'name', 'survey', 'landscape', 'advance', 'satisfaction', 'bread', 'disaster', 'item', 'hat', 'prior', 'shopping', 'visit', 'east', 'photo', 'home', 'idea', 'father', 'comparison', 'cat', 'pipe', 'winner', 'count', 'lake', 'fight', 'prize', 'foundation', 'dog', 'keep', 'ideal', 'fan', 'struggle', 'peak', 'safety', 'solution', 'hell', 'conclusion', 'population', 'strain', 'alarm', 'measurement', 'second', 'train', 'race', 'due', 'insurance', 'boss', 'tree', 'monitor', 'sick', 'course', 'drag', 'appointment', 'slice', 'still', 'care', 'patience', 'rich', 'escape', 'emotion', 'royal', 'female', 'childhood', 'government', 'picture', 'will', 'sock', 'big', 'gate', 'oil', 'cross', 'pin', 'improvement', 'championship', 'silly', 'help', 'sky', 'pitch', 'man', 'diamond', 'most', 'transition', 'work', 'science', 'committee', 'moment', 'fix', 'teaching', 'dig', 'specialist', 'complex', 'guide', 'people', 'dead', 'voice', 'original', 'break', 'topic', 'data', 'degree', 'reading', 'recording', 'bunch', 'reach', 'judgment', 'lie', 'regular', 'set', 'painting', 'mode', 'list', 'player', 'bear', 'north', 'wonder', 'carpet', 'heavy', 'officer', 'negative', 'clock', 'unique', 'baby', 'pain', 'assumption', 'disk', 'iron', 'bill', 'drawer', 'look', 'double', 'mistake', 'finish', 'future', 'brilliant', 'contact', 
'math', 'rice', 'leave', 'restaurant', 'discount', 'sex', 'virus', 'bit', 'trust', 'event', 'wear', 'juice', 'failure', 'bug', 'context', 'mud', 'whole', 'wrap', 'intention', 'draft', 'pressure', 'cake', 'dark', 'explanation', 'space', 'angle', 'word', 'efficiency', 'management', 'habit', 'star', 'chance', 'finding', 'transportation', 'stand', 'criticism', 'flow', 'door', 'injury', 'insect', 'surprise', 'apartment'] # pylint: disable=line-too-long # ISO 639-1 codes to language names. -LANGUAGE_CODES = immutabledict.immutabledict({ +LANGUAGE_CODES = { 'en': 'English', 'es': 'Spanish', 'pt': 'Portuguese', @@ -60,7 +56,7 @@ 'pa': 'Punjabi', 'ml': 'Malayalam', 'fi': 'Finnish', -}) +} _ALPHABETS = '([A-Za-z])' _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
diff --git a/opencompass/datasets/TheoremQA.py b/opencompass/datasets/TheoremQA.py index b3beb8921..0f5d3e60e 100644 --- a/opencompass/datasets/TheoremQA.py +++ b/opencompass/datasets/TheoremQA.py @@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str: else: text = matches[0].strip().strip('.,?!\"\';:') return text + + +def TheoremQA_postprocess_v2(text: str) -> str: + prediction = text.strip().strip('\n').split('\n')[-1] + tmp = '' + for entry in prediction.split(' ')[::-1]: + if entry == 'is' or entry == 'be' or entry == 'are' or entry.endswith( + ':'): + break + tmp = entry + ' ' + tmp + prediction = tmp.strip().strip('.') + return prediction
diff --git a/opencompass/datasets/hellaswag.py b/opencompass/datasets/hellaswag.py index f1879e500..f6a89c49d 100644 --- a/opencompass/datasets/hellaswag.py +++ b/opencompass/datasets/hellaswag.py @@ -1,7 +1,7 @@ import json import os.path as osp -from datasets import Dataset +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET @@ -71,6 +71,32 @@ def load(path): return dataset +@LOAD_DATASET.register_module() +class hellaswagDatasetwithICE(BaseDataset): + + @staticmethod + def load(path): + dataset_dict = DatasetDict() + for split, filename in [ + ['train', 'hellaswag_train_sampled25.jsonl'], + ['val', 'hellaswag.jsonl'], + ]: + dataset = [] + with open(osp.join(path, filename), 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'ctx': data['query'].split(': ', 1)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': 'ABCD'[data['gold']], + }) + dataset_dict[split] = Dataset.from_list(dataset) + return dataset_dict + + class hellaswagDatasetClean(BaseDataset): # load the contamination annotations of CEval from
diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index e9d399594..4bf23f186 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -156,10 +156,13 @@ def humaneval_postprocess_v2(text: str) -> str: """This is an advanced version of previous postprocess to handle more situations, better to use this one.""" try: - # for chatGLM raw text - text = eval(text) + # for chatGLM related text + eval_text = eval(text) except Exception: pass + else: + if isinstance(eval_text, str): + text = eval_text text = text.lstrip('\n') if '```' in text: blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
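The `else`/`isinstance` guard added to `humaneval_postprocess_v2` above is worth a concrete illustration: `eval` may succeed yet return a non-string, which must not overwrite the original completion. A standalone sketch, not part of the patch, with invented inputs:

# Some chat models emit the completion wrapped in an extra layer of quotes;
# eval() unwraps it to the inner string, which is safe to keep.
text = '"def add(a, b):\n    return a + b"'
unwrapped = eval(text)
assert isinstance(unwrapped, str)  # accepted: text is replaced

# eval() can also yield a non-string; the old `text = eval(text)` would then
# crash later on text.lstrip('\n').
result = eval('3 + 4')
assert result == 7 and not isinstance(result, str)  # rejected: original text is kept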
diff --git a/opencompass/datasets/natural_question.py b/opencompass/datasets/natural_question.py index 832994a88..8ca61aa73 100644 --- a/opencompass/datasets/natural_question.py +++ b/opencompass/datasets/natural_question.py @@ -77,9 +77,10 @@ def score(self, predictions, references): cnt = 0 for pred, cand_ans in zip(processed_predictions, processed_answers): detail = {'pred': pred, 'answer': cand_ans, 'correct': False} - cnt += int(any([cand == pred for cand in cand_ans])) - if int(any([cand == pred for cand in cand_ans])): - detail['correct'] = True + # is_correct = any([cand == pred for cand in cand_ans]) + is_correct = any([cand in pred for cand in cand_ans]) + cnt += int(is_correct) + detail['correct'] = is_correct details.append(detail) score = cnt / len(predictions) * 100
diff --git a/opencompass/datasets/winogrande.py b/opencompass/datasets/winogrande.py index c419fa339..0e897ee5f 100644 --- a/opencompass/datasets/winogrande.py +++ b/opencompass/datasets/winogrande.py @@ -1,7 +1,7 @@ import json import os -from datasets import Dataset +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET @@ -20,12 +20,12 @@ def load(path): for line in f: line = json.loads(line) prompt = line['sentence'] - continue_prompt = prompt.split('_') + continue_prompt = prompt.split('_')[1] data_item = { 'opt1': prompt.replace('_', line['option1']), 'opt2': prompt.replace('_', line['option2']), 'answer': line['answer'], - 'cont': continue_prompt[1] + 'cont': continue_prompt, } dataset_list.append(data_item) dataset_list = Dataset.from_list(dataset_list) @@ -44,13 +44,43 @@ def load(path): for line in f: line = json.loads(line) prompt = line['sentence'] + continue_prompt = prompt.split('_')[1] answer = line['answer'] answer = ' AB'[int(answer)] if answer != '' else 'NULL' data_item = { 'opt1': prompt.replace('_', line['option1']), 'opt2': prompt.replace('_', line['option2']), 'answer': answer, + 'cont': continue_prompt, } dataset_list.append(data_item) dataset_list = Dataset.from_list(dataset_list) return dataset_list + + +@LOAD_DATASET.register_module() +class winograndeDataset_V3(BaseDataset): + """Same fields as winograndeDataset_V2, but loads winogrande from local JSONL files instead of Hugging Face.""" + + @staticmethod + def load(path): + dataset_dict = DatasetDict() + for split in ['train_xs', 'dev']: + filename = os.path.join(path, f'{split}.jsonl') + dataset_list = [] + with open(filename, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + prompt = line['sentence'] + continue_prompt = prompt.split('_')[1] + answer = line['answer'] + answer = ' AB'[int(answer)] if answer != '' else 'NULL' + data_item = { + 'opt1': prompt.replace('_', line['option1']), + 'opt2': prompt.replace('_', line['option2']), + 'answer': answer, + 'cont': continue_prompt, + } + dataset_list.append(data_item) + dataset_dict[split] = Dataset.from_list(dataset_list) + return dataset_dict
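To make the fields these winogrande loaders produce concrete, here is a worked example on an invented sentence in the dataset's style (not taken from the patch or the dataset):

sentence = 'The trophy does not fit in the suitcase because _ is too big.'
cont = sentence.split('_')[1]               # ' is too big.' -- the continuation after the blank
opt1 = sentence.replace('_', 'the trophy')  # full sentence with option1 substituted
opt2 = sentence.replace('_', 'the suitcase')
label = ' AB'[int('2')]                     # raw answer '2' maps to label 'B'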
diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index ab0f28b34..3d4d6fccd 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -13,6 +13,7 @@ from .huggingface import HuggingFaceCausalLM # noqa: F401, F403 from .huggingface import HuggingFaceChatGLM3 # noqa: F401, F403 from .intern_model import InternLM # noqa: F401, F403 +from .krgpt_api import KrGPT # noqa: F401 from .lightllm_api import LightllmAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401, F403 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
diff --git a/opencompass/models/krgpt_api.py b/opencompass/models/krgpt_api.py new file mode 100644 index 000000000..e0b38ad74 --- /dev/null +++ b/opencompass/models/krgpt_api.py @@ -0,0 +1,134 @@ +import json +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.registry import MODELS +from opencompass.utils.logging import get_logger +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +@MODELS.register_module() +class KrGPT(BaseAPIModel): + is_api: bool = True + + def __init__( + self, + path: str = 'KrGPT', + url: str = 'http://101.69.162.5:9300/v1/chat/completions', + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + generation_kwargs: Optional[Dict] = dict(), + ): + super().__init__( + path=path, + max_seq_len=max_seq_len, + meta_template=meta_template, + retry=retry, + generation_kwargs=generation_kwargs, + ) + self.logger = get_logger() + self.url = url + self.generation_kwargs = generation_kwargs + self.max_out_len = self.generation_kwargs.get('max_new_tokens', 1024) + + def generate(self, inputs: List[str], max_out_len: int, + **kwargs) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[str]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [self.max_out_len] * len(inputs))) + return results + + def _generate(self, + input: PromptType, + max_out_len: int, + temperature: float = 0.0) -> str: + """Generate a result for a single input. + + Args: + input (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + temperature (float): What sampling temperature to use, + between 0 and 2. Higher values like 0.8 will make the output + more random, while lower values like 0.2 will make it more + focused and deterministic. + + Returns: + str: The generated string. + """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + for item in input: + msg = {'content': item['prompt']} + if item['role'] == 'HUMAN': + msg['role'] = 'user' + elif item['role'] == 'BOT': + msg['role'] = 'assistant' + elif item['role'] == 'SYSTEM': + msg['role'] = 'system' + messages.append(msg) + + max_num_retries = 0 + while max_num_retries < self.retry: + header = {'content-type': 'application/json'} + + try: + data = dict(messages=messages) + raw_response = requests.post(self.url, + headers=header, + data=json.dumps(data)) + except requests.ConnectionError: + self.logger.error('Got connection error, retrying...') + max_num_retries += 1 + continue + try: + response = raw_response.json() + except requests.JSONDecodeError: + self.logger.error('JsonDecode error, got ' + + str(raw_response.content)) + max_num_retries += 1 + continue + try: + return response['choices'][0]['message']['content'].strip() + except KeyError: + self.logger.error('Find error message in response: ' + + str(response)) + max_num_retries += 1 + + raise RuntimeError('Calling KrGPT failed after retrying for ' + f'{max_num_retries} times. Check the logs for ' + 'details.')
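With `KrGPT` exported from `opencompass.models` above, a config entry for this backend would presumably look like the sketch below; the abbr, endpoint, and batch size are illustrative placeholders, not values prescribed by this patch:

from opencompass.models import KrGPT

models = [
    dict(
        type=KrGPT,
        abbr='krgpt',  # hypothetical name used in result tables
        url='http://127.0.0.1:9300/v1/chat/completions',  # replace with a reachable endpoint
        max_seq_len=2048,
        batch_size=8,
        generation_kwargs=dict(max_new_tokens=1024),  # read by KrGPT as its max_out_len
    )
]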
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 29e48bf3e..3c3fd9111 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -415,6 +415,13 @@ def _generate(self, input: str or PromptList, max_out_len: int, self.logger.error(data) else: return choices[0]['message']['content'].strip() + try: + match = re.match(r'Error code: \d+ - (.*)', response['data']) + err = eval(match.group(1))['error'] + if err['code'] == 'content_filter' and err['status'] == 400: + return err['message'] + except Exception: + pass self.logger.error(response['msg']) self.logger.error(response) time.sleep(1)
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 20987cb3f..208c92e5d 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -1,4 +1,5 @@ import datetime +import json import os import os.path as osp import random @@ -38,6 +39,7 @@ def __init__(self, task: ConfigDict, aliyun_cfg: ConfigDict, max_num_workers: int = 32, + eval_with_gpu: list = ['plugin_eval'], retry: int = 2, debug: bool = False, lark_bot_url: str = None): @@ -46,6 +48,8 @@ def __init__(self, self.max_num_workers = max_num_workers self.retry = retry + self.eval_with_gpu = eval_with_gpu + logger = get_logger() logger.warning( 'To ensure the integrity of the log results, the log displayed ' @@ -93,19 +97,62 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): num_gpus = task.num_gpus task_name = task.name + is_eval_task = 'OpenICLEval' in task_name + if is_eval_task and num_gpus == 0: + for check_name in self.eval_with_gpu: + if check_name in task_name: + num_gpus = 1 + break + # Dump task config to file mmengine.mkdir_or_exist('tmp/') param_file = f'tmp/{os.getpid()}_params.py' + pwd = os.getcwd() try: cfg.dump(param_file) + if self.aliyun_cfg.get('bashrc_path') is not None: + # using user's conda env + bashrc_path = self.aliyun_cfg['bashrc_path'] + assert osp.exists(bashrc_path) + assert self.aliyun_cfg.get('conda_env_name') is not None + conda_env_name = self.aliyun_cfg['conda_env_name'] + shell_cmd = (f'source {bashrc_path}; ' + f'conda activate {conda_env_name}; ') + else: + # using public conda env + # users can also set `python_env_path` to their + # own env python path + assert self.aliyun_cfg.get('python_env_path') is not None + shell_cmd = ( + f'export PATH={self.aliyun_cfg["python_env_path"]}/bin:$PATH; ' # noqa: E501 + f'export PYTHONPATH={pwd}:$PYTHONPATH; ') + + huggingface_cache = self.aliyun_cfg.get('huggingface_cache') + if huggingface_cache is not None: + # HUGGINGFACE_HUB_CACHE is a legacy env variable; set both + # `HF_HUB_CACHE` and `HUGGINGFACE_HUB_CACHE` for backward + # compatibility + shell_cmd += f'export HF_HUB_CACHE={huggingface_cache}; ' + shell_cmd += f'export HUGGINGFACE_HUB_CACHE={huggingface_cache}; ' # noqa: E501 + + torch_cache = self.aliyun_cfg.get('torch_cache') + if torch_cache is not None: + shell_cmd += f'export TORCH_HOME={torch_cache}; ' + + hf_offline = self.aliyun_cfg.get('hf_offline', True) + if hf_offline: + shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; ' # noqa: E501 + + http_proxy = self.aliyun_cfg.get('http_proxy') + if http_proxy is not None: + shell_cmd += f'export http_proxy={http_proxy}; export https_proxy={http_proxy}; ' # noqa: E501 + shell_cmd += f'export HTTP_PROXY={http_proxy}; export HTTPS_PROXY={http_proxy}; ' # noqa: E501 - # Build up DLC command - pwd = os.getcwd() - shell_cmd = (
f'source {self.aliyun_cfg["bashrc_path"]}; ' - f'conda activate {self.aliyun_cfg["conda_env_name"]}; ' - f'cd {pwd}; ' - '{task_cmd}') + hf_endpoint = self.aliyun_cfg.get('hf_endpoint') + if hf_endpoint is not None: + shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; ' + + shell_cmd += f'cd {pwd}; ' + shell_cmd += '{task_cmd}' tmpl = ('dlc create job' f" --command '{shell_cmd}'" @@ -114,11 +161,10 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): f" -c {self.aliyun_cfg['dlc_config_path']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}" ' --worker_count 1' - f' --worker_cpu {max(num_gpus * 6, 8)}' + f' --worker_cpu {max(num_gpus * 8, 32)}' f' --worker_gpu {num_gpus}' - f' --worker_memory {max(num_gpus * 64, 48)}' - f" --worker_image {self.aliyun_cfg['worker_image']}" - ' --interactive') + f' --worker_memory {max(num_gpus * 128, 256)}' + f" --worker_image {self.aliyun_cfg['worker_image']}") get_cmd = partial(task.get_command, cfg_path=param_file, template=tmpl) @@ -139,77 +185,64 @@ def _launch(self, cfg: ConfigDict, random_sleep: Optional[bool] = None): time.sleep(random.randint(0, 10)) def _run_within_retry(): - try: - process = subprocess.Popen(cmd, - shell=True, - text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - job_id = None - job_allocated = False - job_finished = False - last_end_time = datetime.datetime.now().strftime( - '%Y-%m-%dT%H:%M:%SZ') - while True: - if not job_allocated: - line = process.stdout.readline() - if not line: - break - match = re.search(r'(dlc[0-9a-z]+)', line) - if match and job_id is None: - job_id = match.group(1) - stdout.write(line) - match = re.search(r'Job .* is \[Running\]', line) - if match: - job_allocated = True - else: - try: - process.wait(10) - except subprocess.TimeoutExpired: - pass - else: - job_finished = True - if job_finished: - this_end_time = datetime.datetime.now( - ).strftime('%Y-%m-%dT%H:%M:%SZ') - else: - this_end_time = ( - datetime.datetime.now() - - datetime.timedelta(seconds=10) - ).strftime('%Y-%m-%dT%H:%M:%SZ') - logs_cmd = ( - 'dlc logs' + output = subprocess.getoutput(cmd) + match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output) + if match is None: + raise RuntimeError( + f'Failed to launch dlc job for {output}') + else: + job_id = match.group(1) + stdout.write(output) + + pod_create_time = None + pri_time = None + initial_time = datetime.datetime.now() + while True: + # 1. Avoid to request dlc too frequently. + # 2. DLC job may not be ready immediately after creation. + for _ in range(5): + time.sleep(2) + try: + job_info = json.loads( + subprocess.getoutput(f'dlc get job {job_id}')) + break + except: # noqa: E722 + pass + else: + raise RuntimeError( + f'Failed to get job info for {job_id}') + + status = job_info['Status'] + if status == 'Failed': + return -1 + elif status == 'Succeeded': + return 0 + elif status != 'Running': + continue + + # The pod time could be different from the real time. + # Therefore we need to extract the pod start time from + # the `job_info` and calculate the `start_time` and + # `end_time` in pod. 
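# --- Editor's sketch, not part of the patch: the pod-time arithmetic below,
# --- run with made-up timestamps.
import datetime
pod_create_time = datetime.datetime(2024, 1, 1, 10, 0, 0)  # pod clock at job creation
initial_time = datetime.datetime(2024, 1, 1, 10, 2, 0)     # local clock when polling began
elapsed_time = datetime.datetime(2024, 1, 1, 10, 7, 0) - initial_time  # 5 minutes of polling
cur_time = (pod_create_time + elapsed_time).strftime('%Y-%m-%dT%H:%M:%SZ')
assert cur_time == '2024-01-01T10:05:00Z'  # log window expressed on the pod's clock
# --- end sketch ---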
+ if pod_create_time is None: + pod_create_time = job_info['GmtCreateTime'] + pri_time = pod_create_time + pod_create_time = datetime.datetime.strptime( + pod_create_time, '%Y-%m-%dT%H:%M:%SZ') + elapsed_time = datetime.datetime.now() - initial_time + cur_time = (pod_create_time + + elapsed_time).strftime('%Y-%m-%dT%H:%M:%SZ') + logs_cmd = ('dlc logs' f' {job_id} {job_id}-worker-0' - f' --start_time {last_end_time}' - f' --end_time {this_end_time}' - f" -c {self.aliyun_cfg['dlc_config_path']}") - log_process = subprocess.Popen( - logs_cmd, - shell=True, - text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - log_output, log_err = log_process.communicate() - log_output = '\n'.join(log_output.split('\n')[2:]) - stdout.write(log_output) - last_end_time = this_end_time + f" -c {self.aliyun_cfg['dlc_config_path']}" + f' --start_time {pri_time}' + f' --end_time {cur_time}') + log_output = subprocess.getoutput(logs_cmd) + + if '[WARN] No logs found for the pod' not in log_output: + pri_time = cur_time + stdout.write(log_output) stdout.flush() - if job_finished: - break - process.wait() - return process.returncode - finally: - if job_id is not None: - cancel_cmd = ( - 'dlc stop job' - f' {job_id}' - f" -c {self.aliyun_cfg['dlc_config_path']}" - ' -f') - subprocess.run(cancel_cmd, - shell=True, - text=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) return_code = _run_within_retry() retry = self.retry
diff --git a/tools/prompt_viewer.py b/tools/prompt_viewer.py index c5de84b75..ed821c5af 100644 --- a/tools/prompt_viewer.py +++ b/tools/prompt_viewer.py @@ -6,7 +6,8 @@ from opencompass.openicl.icl_inferencer import (AgentInferencer, ChatInferencer, CLPInferencer, - GenInferencer, PPLInferencer, + GenInferencer, LLInferencer, + PPLInferencer, PPLOnlyInferencer) from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS from opencompass.utils import (Menu, build_dataset_from_cfg, @@ -81,14 +82,15 @@ def print_prompts(model_cfg, dataset_cfg, count=1): supported_inferencer = [ AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer, - PPLOnlyInferencer, ChatInferencer + PPLOnlyInferencer, ChatInferencer, LLInferencer ] if infer_cfg.inferencer.type not in supported_inferencer: print(f'Only {supported_inferencer} are supported') return for idx in range(min(count, len(ice_idx_list))): - if issubclass(infer_cfg.inferencer.type, PPLInferencer): + if issubclass(infer_cfg.inferencer.type, + (PPLInferencer, LLInferencer)): labels = retriever.get_labels(ice_template=ice_template, prompt_template=prompt_template) ice = retriever.generate_ice(ice_idx_list[idx],