Bump sft-opencompass-v0.5.2
BIGWangYuDong authored and 周丰哲 committed Dec 14, 2023
1 parent 42c7d51 commit a793df5
Showing 26 changed files with 630 additions and 48 deletions.
@@ -34,7 +34,7 @@
# If the IP address is not accessible,
# follow the instructions below to launch a code evaluate service.
# https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb
"http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster
"http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster
# INTERNAL_END
port=5000
),
2 changes: 1 addition & 1 deletion configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py
@@ -34,7 +34,7 @@
# If the IP address is not accessible,
# follow the instructions below to launch a code evaluate service.
# https://aicarrier.feishu.cn/docx/JpLAdWNh9oGC1fxH9Z9cTobLntb
"http://10.140.60.1", # T cluster, http://10.140.0.133 for S cluster
"http://10.140.60.10", # T cluster, http://10.140.0.133 for S cluster
# INTERNAL_END
port=5000
),
36 changes: 36 additions & 0 deletions configs/datasets/humaneval/humaneval_gen_6d1cc2.py
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='./data/humaneval/human-eval-v2-20210705.jsonl',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
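For context on how a dataset config like the one above is usually consumed (not part of this commit): OpenCompass eval configs pull dataset lists in through read_base, the same pattern used by the sft_cfg eval configs later in this diff. A minimal sketch, assuming a hypothetical top-level config placed directly under configs/ so the relative import resolves:

# Hypothetical top-level config (not in this commit), composing the new
# HumanEval dataset config via OpenCompass's read_base mechanism.
from mmengine.config import read_base

with read_base():
    # pulls in humaneval_datasets from the file added above
    from .datasets.humaneval.humaneval_gen_6d1cc2 import humaneval_datasets

datasets = [*humaneval_datasets]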
65 changes: 65 additions & 0 deletions configs/datasets/mbpp/mbpp_gen_caa7ab.py
@@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator

mbpp_reader_cfg = dict(
    input_columns=['text', 'test_list'], output_column='test_list_2')

mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n"
                ),
                dict(role="BOT", prompt="[BEGIN]\n"),

            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")

mbpp_datasets = [
    dict(
        type=MBPPDataset,
        abbr='mbpp',
        path='./data/mbpp/mbpp.jsonl',
        reader_cfg=mbpp_reader_cfg,
        infer_cfg=mbpp_infer_cfg,
        eval_cfg=mbpp_eval_cfg)
]
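The few-shot prompt above asks the model to wrap its program in [BEGIN]/[DONE] tags, which the evaluator is then expected to strip before running the MBPP test cases. As a rough illustration only (an assumption, not the actual MBPPEvaluator implementation), the extraction step amounts to something like:

import re

def extract_program(completion: str) -> str:
    """Illustrative helper (assumption, not OpenCompass code): pull the
    generated program out of a [BEGIN] ... [DONE] wrapped completion."""
    match = re.search(r"\[BEGIN\]\s*(.*?)\s*\[DONE\]", completion, re.DOTALL)
    return match.group(1) if match else completion

print(extract_program("[BEGIN]\ndef add(a, b):\n    return a + b\n[DONE]"))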
12 changes: 7 additions & 5 deletions configs/sft_cfg/123B_eval/eval_internlm-chat_123b.py
@@ -4,8 +4,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.slurm_llmit2 import infer, eval
from ..lark import lark_bot_url

@@ -25,15 +25,17 @@
model_type='LLAMA',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model',
tokenizer_type='v4',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/plato_123b_8k_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/plato_123b_8k_sft.py",
# if got w2w3 miss match error, set w2w3_bug=True
w2w3_bug=False,
meta_template=without_meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
-batch_size=8,
+batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=8,
num_procs=8))
4 changes: 3 additions & 1 deletion configs/sft_cfg/1B_eval/eval_internlm-chat_1b_code-only.py
@@ -16,11 +16,13 @@
model_type='LLAMA',
tokenizer_path='/mnt/petrelfs/share_data/yanhang/tokenizes/llama.model',
tokenizer_type='llama',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
model_config="/mnt/petrelfs/wangyudong/opencompass/configs/sft_cfg/1B_eval/1B_model_config.py",
max_out_len=100,
max_seq_len=2048,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=1,
num_procs=1))
12 changes: 7 additions & 5 deletions configs/sft_cfg/20B_eval/eval_internlm-chat_20b.py
@@ -5,8 +5,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.slurm_llmit2 import infer, eval
from ..lark import lark_bot_url

@@ -24,13 +24,15 @@
model_type='LLAMA',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
tokenizer_type='v7',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/newton_20b_8k_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/newton_20b_8k_sft.py",
meta_template=without_meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
-batch_size=8,
+batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=4,
num_procs=4))
11 changes: 6 additions & 5 deletions configs/sft_cfg/70B_eval/eval_internlm-chat_70b.py
@@ -4,8 +4,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.slurm_llmit2 import infer, eval
from ..lark import lark_bot_url

@@ -25,14 +25,15 @@
model_type="LLAMA",
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/llamav4.model',
tokenizer_type='v4',
# TODO: support relative path in train_internlm
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/euclid_70b_v2_0_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/euclid_70b_v2_0_sft.py",
meta_template=without_meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(num_gpus=4, num_procs=4),
)

56 changes: 56 additions & 0 deletions configs/sft_cfg/7B_chatml_eval/eval_internlm-chat_7b_chatml.py
@@ -0,0 +1,56 @@
from mmengine.config import read_base
from opencompass.models.internal import InternLMwithModule
from copy import deepcopy
import os.path as osp

with read_base():
    from ..dataset_collections.medium_chat_sft_v052 import datasets
    from ..summarizers.medium_chat_sft_v052 import summarizer
    from ..clusters.slurm_llmit2 import infer, eval
    from ..lark import lark_bot_url

without_meta_template = dict(
    begin="""""",
    round=[
        dict(role='HUMAN', begin='<TOKENS_UNUSED_140>user\n', end='<TOKENS_UNUSED_139>\n'),
        dict(role='BOT', begin='<TOKENS_UNUSED_140>assistant\n', end='<TOKENS_UNUSED_139>\n', generate=True),
    ],
    eos_token_id=103166)

base_dict = dict(
    abbr=None,
    path=None,
    type=InternLMwithModule,
    model_type='INTERNLM',
    tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
    tokenizer_type='v7',
    # TODO: add model config in the shared path
    module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
    model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_16k_sft.py",
    meta_template=without_meta_template,
    max_out_len=100,
    # If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
    max_seq_len=8192,
    batch_size=8,
    # using bf16 may decrease the performance, force set to fp16
    model_dtype='torch.float16',
    run_cfg=dict(
        num_gpus=1,
        num_procs=1))

models_path = [
    '/mnt/petrelfs/share_data/wangyudong/ckpt/20231206/v0.16_dev2_16k/sft_7b_16k_0.16dev2_rc5/10270',
]

models = []

for model_path in models_path:
    tmp_model_dict = deepcopy(base_dict)
    if model_path.endswith('/'):
        model_path = model_path[:-1]
    abbr = osp.split(osp.split(model_path)[0])[-1]
    tmp_model_dict['abbr'] = abbr
    tmp_model_dict['path'] = model_path
    models.append(tmp_model_dict)

del models_path, model_path, tmp_model_dict, abbr, base_dict
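A quick worked example of the abbr derivation in the loop above: osp.split is applied twice, so the checkpoint's parent directory name becomes the model abbreviation.

import os.path as osp

model_path = '/mnt/petrelfs/share_data/wangyudong/ckpt/20231206/v0.16_dev2_16k/sft_7b_16k_0.16dev2_rc5/10270'
# osp.split(model_path)        -> ('.../sft_7b_16k_0.16dev2_rc5', '10270')
# osp.split of the first part  -> ('.../v0.16_dev2_16k', 'sft_7b_16k_0.16dev2_rc5')
abbr = osp.split(osp.split(model_path)[0])[-1]
print(abbr)  # sft_7b_16k_0.16dev2_rc5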
10 changes: 6 additions & 4 deletions configs/sft_cfg/7B_eval/eval_internlm-chat_7b.py
@@ -4,8 +4,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.slurm_llmit2 import infer, eval
from ..lark import lark_bot_url

@@ -24,13 +24,15 @@
model_type='INTERNLM',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
tokenizer_type='v7',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py",
meta_template=without_meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=1,
num_procs=1))
6 changes: 4 additions & 2 deletions configs/sft_cfg/7B_eval/eval_internlm-chat_7b_aliyun.py
@@ -4,8 +4,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.aliyun_llm import infer, eval
from ..lark import lark_bot_url

@@ -31,6 +31,8 @@
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=1,
num_procs=1))
8 changes: 4 additions & 4 deletions configs/sft_cfg/7B_eval/eval_internlm-chat_7b_llmv2.py
@@ -4,8 +4,8 @@
import os.path as osp

with read_base():
-from ..dataset_collections.medium_chat_sft_v051 import datasets
-from ..summarizers.medium_chat_sft_v051 import summarizer
+from ..dataset_collections.medium_chat_sft_v052 import datasets
+from ..summarizers.medium_chat_sft_v052 import summarizer
from ..clusters.slurm_llmit2 import infer, eval
from ..lark import lark_bot_url

@@ -24,7 +24,7 @@
model_type='origin',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
tokenizer_type='v7',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
meta_template=without_meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
@@ -35,7 +35,7 @@
num_procs=1))

models_path = [
-'/mnt/petrelfs/llmit/ckpt/7b-8k/sft_7b_v0_11/4930',
+'/mnt/petrelfs/llmit/ckpt/maibao_kaoshi_7_5_ST_8k_v0213rc8/5260',
]

models = []
@@ -35,13 +35,15 @@
model_type='INTERNLM',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
tokenizer_type='v7',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py",
meta_template=meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=1,
num_procs=1))
@@ -32,13 +32,15 @@
model_type='INTERNLM',
tokenizer_path='/mnt/petrelfs/llmit/tokenizers/V7.model',
tokenizer_type='v7',
-module_path="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101",
-model_config="/mnt/petrelfs/llmit/code/opencompass_internal/deliver_1101/train_internlm_deliver_1101/configs/internlm_7b_8k_sft.py",
+module_path="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm",
+model_config="/mnt/petrelfs/llmit/code/opencompass_internal/sft_opencompass_v052/train_internlm/configs/internlm_7b_8k_sft.py",
meta_template=meta_template,
max_out_len=100,
# If want to use the full length of the model, set max_seq_len=8192, otherwise can set max_seq_len=2048.
max_seq_len=8192,
batch_size=8,
+# using bf16 may decrease the performance, force set to fp16
+model_dtype='torch.float16',
run_cfg=dict(
num_gpus=1,
num_procs=1))
