From a793df561a60a34b78b05d3f569c4774fb9aee13 Mon Sep 17 00:00:00 2001 From: wangyudong Date: Thu, 14 Dec 2023 09:08:22 +0000 Subject: [PATCH] Bump sft-opencompass-v0.5.2 --- .../ds1000_compl_service_eval_gen_cbc84f.py | 2 +- .../ds1000/ds1000_service_eval_gen_cbc84f.py | 2 +- .../humaneval/humaneval_gen_6d1cc2.py | 36 +++ configs/datasets/mbpp/mbpp_gen_caa7ab.py | 65 +++++ .../123B_eval/eval_internlm-chat_123b.py | 12 +- .../eval_internlm-chat_1b_code-only.py | 4 +- .../20B_eval/eval_internlm-chat_20b.py | 12 +- .../70B_eval/eval_internlm-chat_70b.py | 11 +- .../eval_internlm-chat_7b_chatml.py | 56 ++++ .../sft_cfg/7B_eval/eval_internlm-chat_7b.py | 10 +- .../7B_eval/eval_internlm-chat_7b_aliyun.py | 6 +- .../7B_eval/eval_internlm-chat_7b_llmv2.py | 8 +- ...eval_internlm-chat_7b_safety_subjective.py | 6 +- .../eval_internlm-chat_7b_subjective.py | 6 +- configs/sft_cfg/clusters/aliyun_llm.py | 18 +- configs/sft_cfg/clusters/slurm_llmit2.py | 23 +- .../dataset_collections/code_core_set.py | 8 +- .../code_only_llama_base.py | 5 + .../code_prompt_engineering.py | 11 + .../dataset_collections/code_with_coreset.py | 8 +- .../medium_chat_sft_v052.py | 74 +++++ .../eval_llama2-internlm-chat_7b_code-only.py | 4 +- .../eval_llama2-internlm_7b_code-only.py | 4 +- .../eval_internlm-chat_7b_only-plugineval.py | 6 +- .../eval_internlm-chat_7b_with-plugineval.py | 6 +- .../summarizers/medium_chat_sft_v052.py | 275 ++++++++++++++++++ 26 files changed, 630 insertions(+), 48 deletions(-) create mode 100644 configs/datasets/humaneval/humaneval_gen_6d1cc2.py create mode 100644 configs/datasets/mbpp/mbpp_gen_caa7ab.py create mode 100644 configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py create mode 100644 configs/sft_cfg/dataset_collections/code_prompt_engineering.py create mode 100644 configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py rename configs/sft_cfg/{7B_eval => plugin_eval}/eval_internlm-chat_7b_only-plugineval.py (89%) rename configs/sft_cfg/{7B_eval => plugin_eval}/eval_internlm-chat_7b_with-plugineval.py (89%) create mode 100644 configs/sft_cfg/summarizers/medium_chat_sft_v052.py diff --git a/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py b/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py index 96f137b61..e19a7d222 100644 --- a/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py +++ b/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py @@ -34,7 +34,7 @@ # If the IP address is not accessible, # follow the instructions below to launch a code evaluate service. # https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb - "http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster + "http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster # INTERNAL_END port=5000 ), diff --git a/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py b/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py index c137bf7c8..b38fb662f 100644 --- a/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py +++ b/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py @@ -34,7 +34,7 @@ # If the IP address is not accessible, # follow the instructions below to launch a code evaluate service. 
# https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb - "http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster + "http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster # INTERNAL_END port=5000 ), diff --git a/configs/datasets/humaneval/humaneval_gen_6d1cc2.py b/configs/datasets/humaneval/humaneval_gen_6d1cc2.py new file mode 100644 index 000000000..9740039ed --- /dev/null +++ b/configs/datasets/humaneval/humaneval_gen_6d1cc2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg) +] diff --git a/configs/datasets/mbpp/mbpp_gen_caa7ab.py b/configs/datasets/mbpp/mbpp_gen_caa7ab.py new file mode 100644 index 000000000..9c24f7ac7 --- /dev/null +++ b/configs/datasets/mbpp/mbpp_gen_caa7ab.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n" + ), + dict(role="BOT", prompt="[BEGIN]\n"), + + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py b/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py index f85f181bc..f126bff25 100644 --- a/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py +++ b/configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -25,15 +25,17 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model', tokenizer_type='v4', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/plato_123b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/plato_123b_8k_sft.py", # if got w2w3 miss match error, set w2w3_bug=True w2w3_bug=False, meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. 
max_seq_len=8192, - batch_size=8, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=8, num_procs=8)) diff --git a/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py b/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py index 01d7e9779..3d8c49437 100644 --- a/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py +++ b/configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py @@ -16,11 +16,13 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/share_data/yanhang/tokenizes/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", model_config="/mnt/petrelfs/wangyudong/opencompass/configs/sft_cfg/1B_eval/1B_model_config.py", max_out_len=100, max_seq_len=2048, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py b/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py index c560e3fc1..a37fd171f 100644 --- a/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py +++ b/configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py @@ -5,8 +5,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,13 +24,15 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/newton_20b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/newton_20b_8k_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, - batch_size=8, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=4, num_procs=4)) diff --git a/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py b/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py index 10add7ab8..6bde10d4f 100644 --- a/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py +++ b/configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -25,14 +25,15 @@ model_type="LLAMA", tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model', tokenizer_type='v4', - # TODO: support relative path in train_internlm - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/euclid_70b_v2_0_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/euclid_70b_v2_0_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict(num_gpus=4, num_procs=4), ) diff --git a/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py b/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py new file mode 100644 index 000000000..874b572e6 --- /dev/null +++ b/configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py @@ -0,0 +1,56 @@ +from mmengine.config import read_base +from opencompass.models.internal import InternLMwithModule +from copy import deepcopy +import os.path as osp + +with read_base(): + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer + from ..clusters.slurm_llmit2 import infer, eval + from ..lark import lark_bot_url + +without_meta_template = dict( + begin="""""", + round=[ + dict(role='HUMAN', begin='user\n', end='\n'), + dict(role='BOT', begin='assistant\n', end='\n', generate=True), + ], + eos_token_id=103166) + +base_dict = dict( + abbr=None, + path=None, + type=InternLMwithModule, + model_type='INTERNLM', + tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', + tokenizer_type='v7', + # TODO: add model config in the shared path + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_16k_sft.py", + meta_template=without_meta_template, + max_out_len=100, + # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
+ max_seq_len=8192, + batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', + run_cfg=dict( + num_gpus=1, + num_procs=1)) + +models_path = [ + '/mnt/petrelfs/share_data/wangyudong/ckpt/20231206/v0.16_dev2_16k/sft_7b_16k_0.16dev2_rc5/10270', +] + +models = [] + +for model_path in models_path: + tmp_model_dict = deepcopy(base_dict) + if model_path.endswith('/'): + model_path = model_path[:-1] + abbr = osp.split(osp.split(model_path)[0])[-1] + tmp_model_dict['abbr'] = abbr + tmp_model_dict['path'] = model_path + models.append(tmp_model_dict) + +del models_path, model_path, tmp_model_dict, abbr, base_dict diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py index 54db35615..a4dddf049 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,13 +24,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py index 8722ba29f..66419ea87 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.aliyun_llm import infer, eval from ..lark import lark_bot_url @@ -31,6 +31,8 @@ # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py index b01f01096..8daf815ba 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py +++ b/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py @@ -4,8 +4,8 @@ import os.path as osp with read_base(): - from ..dataset_collections.medium_chat_sft_v051 import datasets - from ..summarizers.medium_chat_sft_v051 import summarizer + from ..dataset_collections.medium_chat_sft_v052 import datasets + from ..summarizers.medium_chat_sft_v052 import summarizer from ..clusters.slurm_llmit2 import infer, eval from ..lark import lark_bot_url @@ -24,7 +24,7 @@ model_type='origin', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", meta_template=without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. @@ -35,7 +35,7 @@ num_procs=1)) models_path = [ - '/mnt/petrelfs/llmit/ckpt/7b-8k/sft_7b_v0_11/4930', + '/mnt/petrelfs/llmit/ckpt/maibao_kaoshi_7_5_ST_8k_v0213rc8/5260', ] models = [] diff --git a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py index e885e4e9a..cfff7da1f 100644 --- a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py +++ b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_safety_subjective.py @@ -35,13 +35,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py index 6bd8faf67..fa7e67943 100644 --- a/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py +++ b/configs/sft_cfg/7B_subjective/eval_internlm-chat_7b_subjective.py @@ -32,13 +32,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py", meta_template=meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/clusters/aliyun_llm.py b/configs/sft_cfg/clusters/aliyun_llm.py index 14bc1304e..4d2d32130 100644 --- a/configs/sft_cfg/clusters/aliyun_llm.py +++ b/configs/sft_cfg/clusters/aliyun_llm.py @@ -1,4 +1,4 @@ -from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.partitioners import SizePartitioner, NaivePartitioner, InferTimePartitioner from opencompass.runners import DLCRunner from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask @@ -12,7 +12,23 @@ worker_image='master0:5000/eflops/pytorch:py3.6-torch1.8-cuda11.1-rdma5.2-sshd-ubuntu18.04', ) +# new inference setting: the split strategy may speed up inference infer = dict( + partitioner=dict( + type=InferTimePartitioner, + max_task_time=3600, + strategy='split'), + runner=dict( + type=DLCRunner, + max_num_workers=64, + retry=4, + aliyun_cfg=aliyun_cfg, + task=dict(type=OpenICLInferTask) + ), +) + +# original inference setting +infer_size = dict( partitioner=dict( type=SizePartitioner, max_task_size=5000, # default = 2000 diff --git a/configs/sft_cfg/clusters/slurm_llmit2.py b/configs/sft_cfg/clusters/slurm_llmit2.py index 97226e206..a837e8aa5 100644 --- a/configs/sft_cfg/clusters/slurm_llmit2.py +++ b/configs/sft_cfg/clusters/slurm_llmit2.py @@ -1,10 +1,27 @@ -from opencompass.partitioners import SizePartitioner, NaivePartitioner +from opencompass.partitioners import SizePartitioner, NaivePartitioner, InferTimePartitioner from opencompass.runners import SlurmSequentialRunner from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask # if users want to use other partition, use `-p {PARTITION}` # in the command line can change the partition.
+ +# new inference setting: the split strategy may speed up inference infer = dict( + partitioner=dict( + type=InferTimePartitioner, + max_task_time=3600, + strategy='split'), + runner=dict( + type=SlurmSequentialRunner, + max_num_workers=64, + retry=4, + partition='llmit2', + quotatype='auto', + task=dict(type=OpenICLInferTask)), +) + +# original inference setting +infer_size = dict( partitioner=dict( type=SizePartitioner, max_task_size=5000, # default = 2000 @@ -15,7 +32,7 @@ max_num_workers=64, retry=4, partition='llmit2', - quotatype='reserved', + quotatype='auto', task=dict(type=OpenICLInferTask)), ) @@ -24,7 +41,7 @@ runner=dict( type=SlurmSequentialRunner, partition='llmit2', - quotatype='reserved', + quotatype='auto', max_num_workers=128, retry=2, task=dict(type=OpenICLEvalTask)), diff --git a/configs/sft_cfg/dataset_collections/code_core_set.py b/configs/sft_cfg/dataset_collections/code_core_set.py index b7094cc69..d92ecd4a7 100644 --- a/configs/sft_cfg/dataset_collections/code_core_set.py +++ b/configs/sft_cfg/dataset_collections/code_core_set.py @@ -22,8 +22,12 @@ from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from ...datasets.math.math_gen_265cce import math_datasets # 代码 - from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets # 综合推理 from ...datasets.bbh.bbh_gen_5b92b0 import bbh_datasets # 理解 diff --git a/configs/sft_cfg/dataset_collections/code_only_llama_base.py b/configs/sft_cfg/dataset_collections/code_only_llama_base.py index 51b6ab98c..198864e4a 100644 --- a/configs/sft_cfg/dataset_collections/code_only_llama_base.py +++ b/configs/sft_cfg/dataset_collections/code_only_llama_base.py @@ -3,4 +3,9 @@ with read_base(): from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from ...datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets + from ...datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets + from ...datasets.clozeTest_maxmin.clozeTest_maxmin_gen_c205fb import maxmin_datasets + from ...datasets.py150.py150_gen_38b13d import py150_datasets + # humanevalx needs docker during evaluation and is not fully supported yet + # from ...datasets.humanevalx.humanevalx_gen_0af626 import humanevalx_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/sft_cfg/dataset_collections/code_prompt_engineering.py b/configs/sft_cfg/dataset_collections/code_prompt_engineering.py new file mode 100644 index 000000000..edd003c2c --- /dev/null +++ b/configs/sft_cfg/dataset_collections/code_prompt_engineering.py @@ -0,0 +1,11 @@ +from mmengine.config import read_base + +with read_base(): + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git
a/configs/sft_cfg/dataset_collections/code_with_coreset.py b/configs/sft_cfg/dataset_collections/code_with_coreset.py index e2c343669..755cea2a9 100644 --- a/configs/sft_cfg/dataset_collections/code_with_coreset.py +++ b/configs/sft_cfg/dataset_collections/code_with_coreset.py @@ -24,8 +24,12 @@ from ...datasets.math.math_gen_265cce import math_datasets from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # 代码 - from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets - from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets # 理解 # 阅读理解 from ...datasets.obqa.obqa_gen_9069e4 import obqa_datasets diff --git a/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py b/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py new file mode 100644 index 000000000..3bc1a33fa --- /dev/null +++ b/configs/sft_cfg/dataset_collections/medium_chat_sft_v052.py @@ -0,0 +1,74 @@ +from mmengine.config import read_base + +with read_base(): + from ...datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from ...datasets.ceval.ceval_gen_5f30c7 import ceval_datasets + from ...datasets.agieval.agieval_gen_64afd3 import agieval_datasets + from ...datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets + from ...datasets.bbh.bbh_gen_5b92b0 import bbh_datasets + # the new prompts yield better performance than the old ones + from ...datasets.mbpp.mbpp_gen_caa7ab import mbpp_datasets + from ...datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets + # old prompt setting + # from ...datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets + # from ...datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from ...datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets + from ...datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ...datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets + from ...datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets + from ...datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets + from ...datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets + from ...datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets + from ...datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets + from ...datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets + from ...datasets.FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets + from ...datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets + from ...datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets + from ...datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets + from ...datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets + from ...datasets.lambada.lambada_gen_217e11 import lambada_datasets + from ...datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets + from ...datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets + from ...datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets + from ...datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets + from ...datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c
import CB_datasets + from ...datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets + from ...datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets + from ...datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets + from ...datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets + from ...datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets + from ...datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets + from ...datasets.race.race_gen_69ee4f import race_datasets + from ...datasets.Xsum.Xsum_gen_31397e import Xsum_datasets + from ...datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets + from ...datasets.summedits.summedits_gen_315438 import summedits_datasets + from ...datasets.math.math_gen_265cce import math_datasets + from ...datasets.TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets + from ...datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets + from ...datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets + from ...datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets + from ...datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets + from ...datasets.piqa.piqa_gen_1194eb import piqa_datasets + from ...datasets.siqa.siqa_gen_e78df3 import siqa_datasets + from ...datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets + from ...datasets.winogrande.winogrande_gen_a9ede5 import winogrande_datasets + from ...datasets.obqa.obqa_gen_9069e4 import obqa_datasets + from ...datasets.nq.nq_gen_c788f6 import nq_datasets + from ...datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets + from ...datasets.flores.flores_gen_806ede import flores_datasets + # safety datasets are ignored by default + # from ...datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets + # from ...datasets.civilcomments.civilcomments_clp_a3c5fd import civilcomments_datasets + # from ...datasets.jigsawmultilingual.jigsawmultilingual_clp_fe50d8 import jigsawmultilingual_datasets + # from ...datasets.realtoxicprompts.realtoxicprompts_gen_7605e4 import realtoxicprompts_datasets + # from ...datasets.truthfulqa.truthfulqa_gen_5ddc62 import truthfulqa_datasets + # add subjective QA + from ..subjective_qa.subjectiveqav3_gen import subjectiveqav3_datasets + # add new code evaluation datasets + from ...datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets + from ...datasets.clozeTest_maxmin.clozeTest_maxmin_gen_c205fb import maxmin_datasets + from ...datasets.py150.py150_gen_38b13d import py150_datasets + # humanevalx needs docker during evaluation and is not fully supported yet + # from ...datasets.humanevalx.humanevalx_gen_0af626 import humanevalx_datasets + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py index 4cf2a7037..93a75d20b 100644 --- a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py +++ b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm-chat_7b_code-only.py @@ -24,8 +24,8 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/llama2_7b_8k_sft.py", +
module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/llama2_7b_8k_sft.py", max_out_len=100, max_seq_len=2048, meta_template=meta_template, diff --git a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py index e5727e0a2..14f03d1fc 100644 --- a/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py +++ b/configs/sft_cfg/llama2_7B_code_eval/eval_llama2-internlm_7b_code-only.py @@ -16,8 +16,8 @@ model_type='LLAMA', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llama.model', tokenizer_type='llama', - module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101", - model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/llama2_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/llama2_7b_8k_sft.py", max_out_len=100, max_seq_len=2048, batch_size=16, diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py similarity index 89% rename from configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py rename to configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py index 516e3e65a..774d1a4bf 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_only-plugineval.py +++ b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_only-plugineval.py @@ -62,13 +62,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm", - model_config="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm/configs/maibao_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/maibao_7b_8k_sft.py", meta_template=_without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. 
max_seq_len=8192, batch_size=4, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=4, num_procs=1)) diff --git a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py similarity index 89% rename from configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py rename to configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py index c2f3c037b..22956a259 100644 --- a/configs/sft_cfg/7B_eval/eval_internlm-chat_7b_with-plugineval.py +++ b/configs/sft_cfg/plugin_eval/eval_internlm-chat_7b_with-plugineval.py @@ -62,13 +62,15 @@ model_type='INTERNLM', tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model', tokenizer_type='v7', - module_path="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm", - model_config="/mnt/petrelfs/llmit/code/opencompass_v051/train_internlm/configs/maibao_7b_8k_sft.py", + module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm", + model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/maibao_7b_8k_sft.py", meta_template=_without_meta_template, max_out_len=100, # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048. max_seq_len=8192, batch_size=8, + # using bf16 may degrade performance, so force fp16 + model_dtype='torch.float16', run_cfg=dict( num_gpus=1, num_procs=1)) diff --git a/configs/sft_cfg/summarizers/medium_chat_sft_v052.py b/configs/sft_cfg/summarizers/medium_chat_sft_v052.py new file mode 100644 index 000000000..9a1e98215 --- /dev/null +++ b/configs/sft_cfg/summarizers/medium_chat_sft_v052.py @@ -0,0 +1,275 @@ +from mmengine.config import read_base + +with read_base(): + from ...summarizers.groups.agieval import agieval_summary_groups + from ...summarizers.groups.mmlu import mmlu_summary_groups + from ...summarizers.groups.cmmlu import cmmlu_summary_groups + from ...summarizers.groups.ceval import ceval_summary_groups + from ...summarizers.groups.bbh import bbh_summary_groups + from ...summarizers.groups.GaokaoBench import GaokaoBench_summary_groups + from ...summarizers.groups.flores import flores_summary_groups + from ...summarizers.groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups + from ...summarizers.groups.tydiqa import tydiqa_summary_groups + from ...summarizers.groups.xiezhi import xiezhi_summary_groups + from ...summarizers.groups.ds1000 import ds1000_summary_groups + +summarizer = dict( + dataset_abbrs=[ + # '--------- 考试 Exam ---------', # category + # 'Mixed', # subcategory + "ceval", + 'agieval', + 'mmlu', + 'mmlu_cn', # placeholder + "GaokaoBench", + 'ARC-c', + # '自建考试', # subcategory + 'compass_exam-senior-high-2023', + # '--------- 语言 Language ---------', # category + # '字词释义', # subcategory + 'WiC', + 'summedits', + # '成语习语', # subcategory + 'chid-dev', + # '语义相似度', # subcategory + 'afqmc-dev', + 'bustm-dev', + # '指代消解', # subcategory + 'cluewsc-dev', + 'WSC', + 'winogrande', + # '翻译', # subcategory + 'flores_100', + # '--------- 知识 Knowledge ---------', # category + # '知识问答', # subcategory + 'BoolQ', + 'commonsense_qa', + 'nq', + 'triviaqa', + # '多语种问答', # subcategory + 'tydiqa', # placeholder + # '--------- 推理 Reasoning ---------', # category + # '文本蕴含', # subcategory + 'cmnli', + 'ocnli', + 'ocnli_fc-dev', + 'AX_b', + 'AX_g', + 'CB', + 'RTE', + # '常识推理', # subcategory + 'story_cloze', + 'story_cloze_cn', #
placeholder + 'COPA', + 'ReCoRD', + 'hellaswag', + 'piqa', + 'siqa', + 'strategyqa', + # '数学推理', # subcategory + 'math', + 'math_cn', # placeholder + 'gsm8k', + 'gsm8k_cn', # placeholder + # '定理应用', # subcategory + 'TheoremQA', + # '代码', # subcategory + 'openai_humaneval', + 'mbpp', + # '综合推理', # subcategory + "bbh", + # '--------- 理解 Understanding ---------', # category + # '阅读理解', # subcategory + 'C3', + 'CMRC_dev', + 'DRCD_dev', + 'MultiRC', + 'race-middle', + 'race-high', + 'openbookqa_fact', + # '内容总结', # subcategory + 'csl_dev', + 'lcsts', + 'Xsum', + # '内容分析', # subcategory + 'eprstmt-dev', + 'lambada', + 'tnews-dev', + # '--------- 安全 Safety ---------', # category + # '偏见', # subcategory + 'crows_pairs', + 'crowspairs_cn', # placeholder + 'civil_comments', + # '有害性', # subcategory + 'jigsaw_multilingual', + "allenai_real-toxicity-prompts", + # '真实性', # subcategory + ('truthful_qa', 'truth'), + ('truthful_qa', 'info'), + # '--------- ceval 细节 ---------', + "ceval-stem", + "ceval-social-science", + "ceval-humanities", + "ceval-other", + "ceval-hard", + # category + 'ceval-advanced_mathematics', + 'ceval-college_chemistry', + 'ceval-college_physics', + 'ceval-college_programming', + 'ceval-computer_architecture', + 'ceval-computer_network', + 'ceval-discrete_mathematics', + 'ceval-electrical_engineer', + 'ceval-high_school_biology', + 'ceval-high_school_chemistry', + 'ceval-high_school_mathematics', + 'ceval-high_school_physics', + 'ceval-metrology_engineer', + 'ceval-middle_school_biology', + 'ceval-middle_school_chemistry', + 'ceval-middle_school_mathematics', + 'ceval-middle_school_physics', + 'ceval-operating_system', + 'ceval-probability_and_statistics', + 'ceval-veterinary_medicine', + 'ceval-business_administration', + 'ceval-college_economics', + 'ceval-education_science', + 'ceval-high_school_geography', + 'ceval-high_school_politics', + 'ceval-mao_zedong_thought', + 'ceval-marxism', + 'ceval-middle_school_geography', + 'ceval-middle_school_politics', + 'ceval-teacher_qualification', + 'ceval-art_studies', + 'ceval-chinese_language_and_literature', + 'ceval-high_school_chinese', + 'ceval-high_school_history', + 'ceval-ideological_and_moral_cultivation', + 'ceval-law', + 'ceval-legal_professional', + 'ceval-logic', + 'ceval-middle_school_history', + 'ceval-modern_chinese_history', + 'ceval-professional_tour_guide', + 'ceval-accountant', + 'ceval-basic_medicine', + 'ceval-civil_servant', + 'ceval-clinical_medicine', + 'ceval-environmental_impact_assessment_engineer', + 'ceval-fire_engineer', + 'ceval-physician', + 'ceval-plant_protection', + 'ceval-sports_science', + 'ceval-tax_accountant', + 'ceval-urban_and_rural_planner', + # '--------- agieval 细节 ---------', + 'agieval-chinese', + 'agieval-english', + 'agieval-gaokao', + # category + 'agieval-aqua-rat', + 'agieval-math', + 'agieval-logiqa-en', + 'agieval-logiqa-zh', + 'agieval-jec-qa-kd', + 'agieval-jec-qa-ca', + 'agieval-lsat-ar', + 'agieval-lsat-lr', + 'agieval-lsat-rc', + 'agieval-sat-math', + 'agieval-sat-en', + 'agieval-sat-en-without-passage', + 'agieval-gaokao-chinese', + 'agieval-gaokao-english', + 'agieval-gaokao-geography', + 'agieval-gaokao-history', + 'agieval-gaokao-biology', + 'agieval-gaokao-chemistry', + 'agieval-gaokao-physics', + 'agieval-gaokao-mathqa', + 'agieval-gaokao-mathcloze', + # '--------- mmlu 细节 ---------', + 'mmlu-humanities', + 'mmlu-stem', + 'mmlu-social-science', + 'mmlu-other', + # category + 'lukaemon_mmlu_abstract_algebra', + 'lukaemon_mmlu_anatomy', + 'lukaemon_mmlu_astronomy', + 
'lukaemon_mmlu_business_ethics', + 'lukaemon_mmlu_clinical_knowledge', + 'lukaemon_mmlu_college_biology', + 'lukaemon_mmlu_college_chemistry', + 'lukaemon_mmlu_college_computer_science', + 'lukaemon_mmlu_college_mathematics', + 'lukaemon_mmlu_college_medicine', + 'lukaemon_mmlu_college_physics', + 'lukaemon_mmlu_computer_security', + 'lukaemon_mmlu_conceptual_physics', + 'lukaemon_mmlu_econometrics', + 'lukaemon_mmlu_electrical_engineering', + 'lukaemon_mmlu_elementary_mathematics', + 'lukaemon_mmlu_formal_logic', + 'lukaemon_mmlu_global_facts', + 'lukaemon_mmlu_high_school_biology', + 'lukaemon_mmlu_high_school_chemistry', + 'lukaemon_mmlu_high_school_computer_science', + 'lukaemon_mmlu_high_school_european_history', + 'lukaemon_mmlu_high_school_geography', + 'lukaemon_mmlu_high_school_government_and_politics', + 'lukaemon_mmlu_high_school_macroeconomics', + 'lukaemon_mmlu_high_school_mathematics', + 'lukaemon_mmlu_high_school_microeconomics', + 'lukaemon_mmlu_high_school_physics', + 'lukaemon_mmlu_high_school_psychology', + 'lukaemon_mmlu_high_school_statistics', + 'lukaemon_mmlu_high_school_us_history', + 'lukaemon_mmlu_high_school_world_history', + 'lukaemon_mmlu_human_aging', + 'lukaemon_mmlu_human_sexuality', + 'lukaemon_mmlu_international_law', + 'lukaemon_mmlu_jurisprudence', + 'lukaemon_mmlu_logical_fallacies', + 'lukaemon_mmlu_machine_learning', + 'lukaemon_mmlu_management', + 'lukaemon_mmlu_marketing', + 'lukaemon_mmlu_medical_genetics', + 'lukaemon_mmlu_miscellaneous', + 'lukaemon_mmlu_moral_disputes', + 'lukaemon_mmlu_moral_scenarios', + 'lukaemon_mmlu_nutrition', + 'lukaemon_mmlu_philosophy', + 'lukaemon_mmlu_prehistory', + 'lukaemon_mmlu_professional_accounting', + 'lukaemon_mmlu_professional_law', + 'lukaemon_mmlu_professional_medicine', + 'lukaemon_mmlu_professional_psychology', + 'lukaemon_mmlu_public_relations', + 'lukaemon_mmlu_security_studies', + 'lukaemon_mmlu_sociology', + 'lukaemon_mmlu_us_foreign_policy', + 'lukaemon_mmlu_virology', + 'lukaemon_mmlu_world_religions', + # new code eval datasets + 'maxmin', + 'py150', + 'ds1000', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib' + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith("_summary_groups")], []), + prompt_db=dict( + database_path='configs/datasets/log.json', + config_dir='configs/datasets', + blacklist='.promptignore'), +)
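Note (not part of the patch): the cluster configs in this patch now export two inference settings, `infer`, built on the new InferTimePartitioner with the 'split' strategy, and `infer_size`, which keeps the original SizePartitioner behaviour. Below is a minimal, hypothetical downstream config sketch for falling back to the size-based setting; the `read_base` import pattern and the `del` cleanup idiom are taken from the configs in this patch, while the rebinding line is an assumption about how a consumer would select between the two exports, not part of the delivered code.

    # hypothetical_eval_cfg.py -- illustration only, not shipped in this patch
    from mmengine.config import read_base

    with read_base():
        # import the original size-based setting instead of the
        # time-based `infer` that the cluster config now exports by default
        from ..clusters.slurm_llmit2 import infer_size, eval
        from ..dataset_collections.medium_chat_sft_v052 import datasets
        from ..summarizers.medium_chat_sft_v052 import summarizer

    # rebind so downstream tasks pick up the size-based partitioner,
    # then drop the alias to keep the resulting config namespace clean
    infer = infer_size
    del infer_size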