Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ci] add common testcase into daily testcase #1447

Merged
merged 22 commits into from
Aug 22, 2024
Merged
51 changes: 30 additions & 21 deletions .github/scripts/eval_regression_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,57 @@

with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_ppl import \
from opencompass.configs.datasets.race.race_ppl import \
race_datasets # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
models as hf_deepseek_moe_16b_base_model # noqa: F401, E501
# read hf models - chat models
from ...configs.models.deepseek.lmdeploy_deepseek_7b_base import \
from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_moe_16b_base import \
from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
models as vllm_deepseek_moe_16b_base_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b import \
from opencompass.configs.models.gemma.hf_gemma_2b import \
models as hf_gemma_2b_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b import \
from opencompass.configs.models.gemma.hf_gemma_7b import \
models as hf_gemma_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
models as hf_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
models as hf_internlm2_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
models as hf_internlm2_base_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
models as lmdeploy_internlm2_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
models as lmdeploy_llama3_8b_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_v0_2 import \
models as hf_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_v0_2 import \
models as vllm_mistral_7b_v0_2_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
models as hf_qwen1_5_moe_a2_7b_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen2_0_5b import \
from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
models as hf_qwen2_0_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
models as lmdeploy_qwen2_7b_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
models as vllm_qwen1_5_0_5b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b import \
from opencompass.configs.models.yi.hf_yi_1_5_6b import \
models as hf_yi_1_5_6b_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b import \
from opencompass.configs.models.yi.hf_yi_1_5_9b import \
models as hf_yi_1_5_9b_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
Expand Down
95 changes: 65 additions & 30 deletions .github/scripts/eval_regression_chat.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,105 @@
from mmengine.config import read_base

from opencompass.models import OpenAISDK

with read_base():
# choose a list of datasets
from ...configs.datasets.gsm8k.gsm8k_gen import \
from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
gsm8k_datasets # noqa: F401, E501
from ...configs.datasets.race.race_gen import \
from opencompass.configs.datasets.race.race_gen import \
race_datasets # noqa: F401, E501
# read hf models - chat models
from ...configs.models.baichuan.hf_baichuan2_7b_chat import \
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
models as hf_baichuan2_7b_chat_model # noqa: F401, E501
from ...configs.models.chatglm.hf_glm4_9b_chat import \
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
models as hf_glm4_9b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
models as hf_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.hf_deepseek_moe_16b_chat import \
from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501
from ...configs.models.deepseek.vllm_deepseek_7b_chat import \
from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
models as vllm_deepseek_7b_chat_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_2b_it import \
from opencompass.configs.models.gemma.hf_gemma_2b_it import \
models as hf_gemma_2b_it_model # noqa: F401, E501
from ...configs.models.gemma.hf_gemma_7b_it import \
from opencompass.configs.models.gemma.hf_gemma_7b_it import \
models as hf_gemma_7b_it_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
models as lmdeploy_internlm2_chat_1_8b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501
from ...configs.models.hf_internlm.vllm_internlm2_chat_7b import \
from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
models as vllm_internlm2_chat_7b_model # noqa: F401, E501
from ...configs.models.hf_llama.hf_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501
from ...configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models as hf_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_dpo_fp32 import \
models as hf_minicpm_2b_dpo_fp32_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import \
models as hf_minicpm_2b_sft_bf16_model # noqa: F401, E501
from ...configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_fp32 import \
models as hf_minicpm_2b_sft_fp32_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_mini_4k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501
from ...configs.models.phi.hf_phi_3_small_8k_instruct import \
from opencompass.configs.models.phi.hf_phi_3_small_8k_instruct import \
models as hf_phi_3_mini_8k_instruct_model # noqa: F401, E501
from ...configs.models.qwen.hf_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
models as hf_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model # noqa: F401, E501
from ...configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
models as vllm_qwen1_5_0_5b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_6b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
models as hf_yi_1_5_6b_chat_model # noqa: F401, E501
from ...configs.models.yi.hf_yi_1_5_9b_chat import \
from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
models as hf_yi_1_5_9b_chat_model # noqa: F401, E501
from ...configs.summarizers.medium import summarizer # noqa: F401, E501
from opencompass.configs.summarizers.medium import \
summarizer # noqa: F401, E501

# Aggregate every config pulled in by read_base(): each imported alias lands
# in locals() as a *_model / *_datasets name, so summing the matching values
# yields the flat model and dataset lists OpenCompass expects.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

# Prompt meta template for the OpenAI-compatible API model appended below.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

model_name = ''  # NOTE(review): unused in this chunk — confirm a later section reads it

# Extra entry that exercises the OpenAI-compatible API path (served model).
# NOTE(review): openai_api_base points at an internal address (10.1.9.14),
# so this model is only reachable from the CI network.
models.append(
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://10.1.9.14:10001/v1',
        path='compass_judger_internlm2_102b_0508',
        tokenizer_path='internlm/internlm2_5-20b-chat',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=50,
        max_out_len=1024,
        max_seq_len=4096,
        temperature=0.01,
        batch_size=128,
        retry=3,
    ))

# Cap every dataset to its first 100 samples to keep the daily CI run fast.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:100]'
69 changes: 58 additions & 11 deletions .github/scripts/oc_score_assert.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,25 @@

chat_model_list = [
'baichuan2-7b-chat-hf', 'deepseek-7b-chat-hf', 'deepseek-moe-16b-chat-hf',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2-chat-1.8b-turbomind',
'gemma-2b-it-hf', 'gemma-7b-it-hf', 'internlm2_5-7b-chat-hf',
'internlm2_5-7b-chat-turbomind', 'internlm2-chat-1.8b-turbomind',
'internlm2-chat-1.8b-sft-turbomind', 'internlm2-chat-7b-turbomind',
'internlm2-chat-7b-sft-turbomind', 'llama-3-8b-instruct-hf',
'llama-3-8b-instruct-turbomind', 'mistral-7b-instruct-v0.2-hf',
'minicpm-2b-dpo-fp32-hf', 'minicpm-2b-sft-bf16-hf',
'minicpm-2b-sft-fp32-hf', 'phi-3-mini-4k-instruct-hf',
'qwen1.5-0.5b-chat-hf', 'qwen2-1.5b-instruct-turbomind',
'qwen2-7b-instruct-turbomind', 'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf'
'internlm2-chat-7b-sft-turbomind', 'internlm2_5-7b-chat-turbomind',
'llama-3-8b-instruct-hf', 'llama-3-8b-instruct-turbomind',
'mistral-7b-instruct-v0.2-hf', 'minicpm-2b-dpo-fp32-hf',
'minicpm-2b-sft-bf16-hf', 'minicpm-2b-sft-fp32-hf',
'phi-3-mini-4k-instruct-hf', 'qwen1.5-0.5b-chat-hf',
'qwen2-1.5b-instruct-turbomind', 'qwen2-7b-instruct-turbomind',
'yi-1.5-6b-chat-hf', 'yi-1.5-9b-chat-hf', 'lmdeploy-api-test'
]
base_model_list = [
'deepseek-moe-16b-base-hf', 'deepseek-7b-base-turbomind', 'gemma-2b-hf',
'gemma-7b-hf', 'internlm2-1.8b-turbomind', 'internlm2-7b-turbomind',
'internlm2-base-7b-turbomind', 'llama-3-8b-turbomind',
'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf', 'qwen2-0.5b-hf',
'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind', 'yi-1.5-6b-hf',
'yi-1.5-9b-hf'
'internlm2_5-7b-turbomind', 'internlm2_5-7b-hf',
'internlm2-base-7b-turbomind', 'internlm2-base-7b-hf',
'llama-3-8b-turbomind', 'mistral-7b-v0.2-hf', 'qwen1.5-moe-a2.7b-hf',
'qwen2-0.5b-hf', 'qwen2-1.5b-turbomind', 'qwen2-7b-turbomind',
'yi-1.5-6b-hf', 'yi-1.5-9b-hf'
]
dataset_list = ['gsm8k', 'race-middle', 'race-high']

Expand Down Expand Up @@ -77,6 +80,50 @@ def test_model_dataset_score(self, baseline_scores, result_scores, model,
assert_score(result_score, base_score)


@pytest.mark.usefixtures('result_scores')
class TestCmdCase:
    """Checks for results produced by direct ``opencompass`` CLI runs.

    Each ``caseN`` marker corresponds to one CLI invocation variant
    (the hyphen/underscore abbreviation pairs suggest config-file vs.
    command-line model selection — TODO confirm against the CI workflow).
    Every case is expected to produce results for exactly one model,
    scored on race-middle and race-high against a shared baseline of 91.
    """

    @pytest.mark.case1
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-hf', 'race-middle'),
                              ('internlm2_5-7b-hf', 'race-high')])
    def test_cmd_case1(self, result_scores, model, dataset):
        # The run under test evaluates a single model; any other count means
        # the run failed or picked up stale result files.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case2
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-chat-turbomind', 'race-middle'),
                              ('internlm2_5-7b-chat-turbomind', 'race-high')])
    def test_cmd_case2(self, result_scores, model, dataset):
        # Same single-model invariant as case1, for the chat/turbomind run.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case3
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b_hf', 'race-middle'),
                              ('internlm2_5-7b_hf', 'race-high')])
    def test_cmd_case3(self, result_scores, model, dataset):
        # NOTE(review): underscore abbreviation ('...7b_hf') is intentional
        # here and distinct from case1's hyphenated form — do not "fix".
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)

    @pytest.mark.case4
    @pytest.mark.parametrize('model, dataset',
                             [('internlm2_5-7b-chat_hf', 'race-middle'),
                              ('internlm2_5-7b-chat_hf', 'race-high')])
    def test_cmd_case4(self, result_scores, model, dataset):
        # Chat-model counterpart of case3's underscore abbreviation.
        if len(result_scores.keys()) != 1:
            assert False, 'result is none'
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 91)


def assert_score(score, baseline):
if score is None or score == '-':
assert False, 'value is none'
Expand Down
36 changes: 35 additions & 1 deletion .github/scripts/oc_score_baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ baichuan2-7b-chat-hf:
race-middle: 74
race-high: 79

glm-4-9b-chat-hf:
gsm8k: 75
race-middle: 88
race-high: 88

deepseek-7b-chat-hf:
gsm8k: 60
race-middle: 74
Expand All @@ -23,6 +28,16 @@ gemma-7b-it-hf:
race-middle: 74
race-high: 71

internlm2_5-7b-chat-hf:
gsm8k: 86
race-middle: 92
race-high: 93

internlm2_5-7b-chat-turbomind:
gsm8k: 87
race-middle: 92
race-high: 93

internlm2-chat-1.8b-turbomind:
gsm8k: 40
race-middle: 82
Expand Down Expand Up @@ -108,6 +123,10 @@ deepseek-moe-16b-base-hf:
race-middle: 35
race-high: 23

lmdeploy-api-test:
gsm8k: 90
race-middle: 95
race-high: 96

deepseek-7b-base-turbomind:
gsm8k: 21
Expand All @@ -124,8 +143,18 @@ gemma-7b-hf:
race-middle: 59
race-high: 66

internlm2_5-7b-hf:
gsm8k: 46
race-middle: 92
race-high: 91

internlm2_5-7b-turbomind:
gsm8k: 73
race-middle: 90
race-high: 91

internlm2-1.8b-turbomind:
gsm8k: 27
gsm8k: 25
race-middle: 75
race-high: 72

Expand All @@ -134,6 +163,11 @@ internlm2-7b-turbomind:
race-middle: 78
race-high: 76

internlm2-base-7b-hf:
gsm8k: 2
race-middle: 71
race-high: 74

internlm2-base-7b-turbomind:
gsm8k: 39
race-middle: 75
Expand Down
Loading
Loading