Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ci] add common testcase into daily testcase #1447

Merged
merged 22 commits into from
Aug 22, 2024
Merged
51 changes: 30 additions & 21 deletions .github/scripts/eval_regression_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,57 @@

with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_ppl import \
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
# read hf models - chat models
from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b import \
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b import \
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen2_0_5b import \
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
models as hf_qwen2_0_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
models as lmdeploy_qwen2_7b_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
models as vllm_qwen1_5_0_5b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b import \
from opencompass.configs.models.yi.hf_yi_1_5_6b import \
models as hf_yi_1_5_6b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b import \
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
Expand Down
95 changes: 65 additions & 30 deletions .github/scripts/eval_regression_chat.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,105 @@
from mmengine.config import read_base

from opencompass.models import OpenAISDK

with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_gen import \
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
# read hf models - chat models
from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
from ...configs.models.chatglm.hf_glm4_9b_chat import \
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
models as hf_glm4_9b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b_it import \
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b_it import \
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
models as lmdeploy_internlm2_chat_1_8b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
models as hf_minicpm_2b_sft_bf16_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
models as hf_minicpm_2b_sft_fp32_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
models as vllm_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501

# Aggregate every config pulled in by read_base(): each imported alias lands
# in locals() as a *_model / *_datasets name, so summing the matching values
# yields the flat model and dataset lists OpenCompass expects.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

# Prompt meta template for the OpenAI-compatible API model appended below.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

model_name = ''  # NOTE(review): unused in this chunk — confirm a later section reads it

# Extra entry that exercises the OpenAI-compatible API path (served model).
# NOTE(review): openai_api_base points at an internal address (10.1.9.14),
# so this model is only reachable from the CI network.
models.append(
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://10.1.9.14:10001/v1',
        path='compass_judger_internlm2_102b_0508',
        tokenizer_path='internlm/internlm2_5-20b-chat',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=50,
        max_out_len=1024,
        max_seq_len=4096,
        temperature=0.01,
        batch_size=128,
        retry=3,
    ))

# Cap every dataset to its first 100 samples to keep the daily CI run fast.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:100]'
69 changes: 58 additions & 11 deletions .github/scripts/oc_score_assert.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,25 @@

chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
'yi-1.5-9b-hf'
'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']

Expand Down Expand Up @@ -77,6 +80,50 @@ def test_model_dataset_score(self, baseline_scores, result_scores, model,
assert_score(result_score, base_score)


@pytest.mark.usefixtures('result_scores')
class TestCmdCase:
    """Checks for results produced by direct ``opencompass`` CLI runs.

    Each ``caseN`` marker corresponds to one CLI invocation variant
    (the hyphen/underscore abbreviation pairs suggest config-file vs.
    command-line model selection — TODO confirm against the CI workflow).
    Every case is expected to produce results for exactly one model,
    scored on race-middle and race-high against a shared baseline of 91.
    """

    @pytest.mark.case1
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-hf', 'race-middle'),
                              ('internlm2_5-7b-hf', 'race-high')])
    def test_cmd_case1(self, result_scores, model, dataset):
        # The run under test evaluates a single model; any other count means
        # the run failed or picked up stale result files.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case2
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-chat-turbomind', 'race-middle'),
                              ('internlm2_5-7b-chat-turbomind', 'race-high')])
    def test_cmd_case2(self, result_scores, model, dataset):
        # Same single-model invariant as case1, for the chat/turbomind run.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case3
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b_hf', 'race-middle'),
                              ('internlm2_5-7b_hf', 'race-high')])
    def test_cmd_case3(self, result_scores, model, dataset):
        # NOTE(review): underscore abbreviation ('...7b_hf') is intentional
        # here and distinct from case1's hyphenated form — do not "fix".
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case4
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-chat_hf', 'race-middle'),
                              ('internlm2_5-7b-chat_hf', 'race-high')])
    def test_cmd_case4(self, result_scores, model, dataset):
        # Chat-model counterpart of case3's underscore abbreviation.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)


def assert_score(score, baseline):
if score is None or score == '-':
assert False, 'value is none'
Expand Down
36 changes: 35 additions & 1 deletion .github/scripts/oc_score_baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ baichuan2-7b-chat-hf:
race-middle: 74
race-high: 79

glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88

deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
Expand All @@ -23,6 +28,16 @@ gemma-7b-it-hf:
race-middle: 74
race-high: 71

internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93

internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93

internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
Expand Down Expand Up @@ -108,6 +123,10 @@ deepseek-moe-16b-base-hf:
race-middle: 35
race-high: 23

lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96

deepseek-7b-base-turbomind:
gsm8k: 21
Expand All @@ -124,8 +143,18 @@ gemma-7b-hf:
race-middle: 59
race-high: 66

internlm2_5-7b-hf:
gsm8k: 46
race-middle: 92
race-high: 91

internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91

internlm2-1.8b-turbomind:
gsm8k: 27
gsm8k: 25
race-middle: 75
race-high: 72

Expand All @@ -134,6 +163,11 @@ internlm2-7b-turbomind:
race-middle: 78
race-high: 76

internlm2-base-7b-hf:
gsm8k: 2
race-middle: 71
race-high: 74

internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
Expand Down
Loading
Loading