import jsonlines
from tqdm import tqdm
from singularity_nlp.util.translation import translate


def _translate_history(history, target, source):
    """Translate every value of every turn in *history*.

    Returns the translated list of turns, or ``None`` as soon as any single
    translation fails — the caller drops the whole record in that case so the
    output only ever contains fully translated dialogues.
    """
    translated_turns = []
    for turn in history:
        translated_turn = {}
        for key, text in turn.items():
            translated_text = translate(text, target=target, source=source, format='text')
            if not translated_text:
                return None  # one failed turn poisons the whole record
            translated_turn[key] = translated_text
        translated_turns.append(translated_turn)
    return translated_turns


def data_translate(input_file, output_file, line_cnt=None, target='ja', source='en'):
    """Translate the ``history`` turns of each JSONL record from *source* to *target*.

    Reads ``input_file`` (one JSON object per line, each carrying a ``history``
    list of ``{role: text}`` turns), writes each fully translated record to
    ``output_file`` and also collects it into the returned list.  Records with
    any untranslatable turn are dropped entirely.

    Args:
        input_file: path of the source-language JSONL file.
        output_file: path the translated JSONL file is written to.
        line_cnt: optional total line count, used only for the progress bar.
        target: target language code forwarded to ``translate``.
        source: source language code forwarded to ``translate``.

    Returns:
        list of translated records (same schema as the input records).
    """
    translated = []
    with jsonlines.open(input_file, 'r') as fr, jsonlines.open(output_file, 'w') as fw:
        for item in tqdm(fr, total=line_cnt):
            new_history = _translate_history(item['history'], target, source)
            if new_history is None:
                continue  # drop records that could not be fully translated
            new_item = item.copy()
            new_item['history'] = new_history
            translated.append(new_item)
            fw.write(new_item)
    return translated


if __name__ == '__main__':
    # mtbench101.jsonl has 1388 records; produce one translated copy per
    # evaluation language instead of seven copy-pasted calls.
    for lang in ('ja', 'ar', 'id', 'fr', 'de', 'it', 'es'):
        data_translate('mtbench101.jsonl', f'mtbench101_{lang}.jsonl',
                       line_cnt=1388, target=lang, source='en')
# ---------------------------------------------------------------------------
# Checkpoints to evaluate.  One run directory per SFT configuration; the
# "# min loss" markers carry over the original author's notes about which
# checkpoint had the lowest eval loss in its run.
# ---------------------------------------------------------------------------
_sailor = f'{prefix}/Sailor-7B-Chat_SFT_SEQ4096'
_qwen2 = f'{prefix}/Qwen2-57B-A14B-Instruct_SFT_SEQ4096'

v0_3_ep2 = f'{_sailor}_LR5e-6_EP4_GBS8x1x2_20240628_expand_trans_write_chatedit/checkpoints/checkpoint-943'

_v0_4_run = f'{_sailor}_LR5e-6_EP4_GBS8x1x2_20240628_trans151_write0422_chatedit0508/checkpoints'
v0_4_ep1 = f'{_v0_4_run}/checkpoint-13'
v0_4_ep2 = f'{_v0_4_run}/checkpoint-27'  # min loss
v0_4_ep3 = f'{_v0_4_run}/checkpoint-40'
v0_4_ep4 = f'{_v0_4_run}/checkpoint-52'

_v0_5_run = f'{_sailor}_LR1e-5_EP4_GBS8x1x2_20240628_trans151_write0422_chatedit0508/checkpoints'
v0_5_ep1 = f'{_v0_5_run}/checkpoint-13'
v0_5_ep2 = f'{_v0_5_run}/checkpoint-27'  # min loss
v0_5_ep3 = f'{_v0_5_run}/checkpoint-40'
v0_5_ep4 = f'{_v0_5_run}/checkpoint-52'

_v0_6_run = f'{_sailor}_LR1e-5_EP4_GBS8x1x2_20240628_write0422_chatedit0508/checkpoints'
v0_6_ep1 = f'{_v0_6_run}/checkpoint-10'
v0_6_ep2 = f'{_v0_6_run}/checkpoint-20'  # min loss
v0_6_ep3 = f'{_v0_6_run}/checkpoint-30'
v0_6_ep4 = f'{_v0_6_run}/checkpoint-40'

_v0_7_run = f'{_sailor}_LR1e-5_EP4_GBS8x1x2_20240628_write0422/checkpoints'
v0_7_ep1 = f'{_v0_7_run}/checkpoint-7'  # min loss
v0_7_ep2 = f'{_v0_7_run}/checkpoint-14'
v0_7_ep3 = f'{_v0_7_run}/checkpoint-21'
v0_7_ep4 = f'{_v0_7_run}/checkpoint-28'

_v0_8_run = f'{_sailor}_LR1e-5_EP4_GBS8x1x2_20240628_chatedit0508/checkpoints'
v0_8_ep1 = f'{_v0_8_run}/checkpoint-3'  # min loss
v0_8_ep3 = f'{_v0_8_run}/checkpoint-10'

_v1_1_run = f'{_qwen2}_LR1e-5_EP4_GBS32x1x1_20240628_trans151_write0422_chatedit0508/checkpoints'
v1_1_ep1 = f'{_v1_1_run}/checkpoint-7'
v1_1_ep2 = f'{_v1_1_run}/checkpoint-14'  # min loss
v1_1_ep3 = f'{_v1_1_run}/checkpoint-21'
v1_1_ep4 = f'{_v1_1_run}/checkpoint-28'

# NOTE(review): the original paths had a stray double slash before
# 'checkpoints'; normalized to a single slash (equivalent on POSIX).
_v1_2_run = f'{_qwen2}_LR5e-6_EP4_GBS32x1x1_20240628_expand_trans_write_chatedit/checkpoints'
v1_2_ep1 = f'{_v1_2_run}/checkpoint-236'  # min loss
v1_2_ep2 = f'{_v1_2_run}/checkpoint-472'
v1_2_ep3 = f'{_v1_2_run}/checkpoint-708'


def _vllm_model(abbr, path, num_gpus):
    """Build one VLLM model entry; every evaluated checkpoint shares these
    generation settings, so only abbr/path/GPU count vary per checkpoint."""
    return dict(
        abbr=abbr,
        type=VLLM,
        path=path,
        model_kwargs=dict(tensor_parallel_size=num_gpus),
        meta_template=qwen_meta_template,
        generation_kwargs=dict(do_sample=True),
        max_seq_len=4096,
        max_out_len=4096,
        batch_size=4,
        stop_words=['<|im_end|>'],
        run_cfg=dict(num_gpus=num_gpus, num_procs=1),
    )


# (abbr, checkpoint path, GPUs) for every checkpoint under evaluation.
# The Qwen2-57B runs need 2 GPUs (GPU_NUMS2); the Sailor-7B runs need 1.
_CHECKPOINTS = [
    ('id_v0_3_ep1', v0_3_ep1, GPU_NUMS),
    ('id_v0_3_ep2', v0_3_ep2, GPU_NUMS),
    ('id_v0_4_ep1', v0_4_ep1, GPU_NUMS),
    ('id_v0_4_ep2', v0_4_ep2, GPU_NUMS),
    ('id_v0_4_ep3', v0_4_ep3, GPU_NUMS),
    ('id_v0_4_ep4', v0_4_ep4, GPU_NUMS),
    ('id_v0_5_ep1', v0_5_ep1, GPU_NUMS),
    ('id_v0_5_ep2', v0_5_ep2, GPU_NUMS),
    ('id_v0_5_ep3', v0_5_ep3, GPU_NUMS),
    ('id_v0_5_ep4', v0_5_ep4, GPU_NUMS),
    ('id_v0_6_ep1', v0_6_ep1, GPU_NUMS),
    ('id_v0_6_ep2', v0_6_ep2, GPU_NUMS),
    ('id_v0_6_ep3', v0_6_ep3, GPU_NUMS),
    ('id_v0_6_ep4', v0_6_ep4, GPU_NUMS),
    ('id_v0_7_ep1', v0_7_ep1, GPU_NUMS),
    ('id_v0_7_ep2', v0_7_ep2, GPU_NUMS),
    ('id_v0_7_ep3', v0_7_ep3, GPU_NUMS),
    ('id_v0_7_ep4', v0_7_ep4, GPU_NUMS),
    ('id_v0_8_ep1', v0_8_ep1, GPU_NUMS),
    ('id_v0_8_ep3', v0_8_ep3, GPU_NUMS),
    ('id_v1_1_ep1', v1_1_ep1, GPU_NUMS2),
    ('id_v1_1_ep2', v1_1_ep2, GPU_NUMS2),
    ('id_v1_1_ep3', v1_1_ep3, GPU_NUMS2),
    ('id_v1_1_ep4', v1_1_ep4, GPU_NUMS2),
    ('id_v1_2_ep1', v1_2_ep1, GPU_NUMS2),
    ('id_v1_2_ep2', v1_2_ep2, GPU_NUMS2),
    ('id_v1_2_ep3', v1_2_ep3, GPU_NUMS2),
]
models = [_vllm_model(abbr, path, gpus) for abbr, path, gpus in _CHECKPOINTS]

# Only the Indonesian MT-Bench-101 split from the shared subjective datasets.
datasets = [x for x in subjective_datasets if x['abbr'] in ['mtbench101_id']]

judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # NOTE(review): API key left empty in source -- supply before running
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=4096,
    max_seq_len=4096,
    batch_size=8,
    temperature=0.8,
)]

infer = dict(
    partitioner=dict(
        type=SizePartitioner,
        max_task_size=10000,
    ),
    runner=dict(
        type=LocalRunner,
        # TODO: try more workers and larger num_procs / batch_size to see
        # whether inference speeds up.
        max_num_workers=32,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=10000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,
        task=dict(type=SubjectiveEvalTask),
    ),
)

summarizer = dict(type=MTBench101Summarizer, judge_type='single')
liuyao/eval_subjective_mtbench101_ja.py index b481b8778..24fb36a4c 100644 --- a/liuyao/eval_subjective_mtbench101.py +++ b/liuyao/eval_subjective_mtbench101_ja.py @@ -1,4 +1,5 @@ from opencompass.models import OpenAI, VLLM, VLLMwithChatTemplate +from opencompass.models.template import qwen_meta_template, karakuri_meta_template, suzume_meta_template, api_meta_template from opencompass.partitioners.sub_size import SubjectiveSizePartitioner from opencompass.runners import LocalRunner from opencompass.tasks.subjective_eval import SubjectiveEvalTask @@ -10,37 +11,8 @@ from ..configs.datasets.subjective.multiround.mtbench101_judge import subjective_datasets -api_meta_template = dict( - round=[ - dict(role='SYSTEM', api_role='SYSTEM'), - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) -qwen_meta_template = dict( - round=[ - dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), - ], - eos_token_id=151645, -) -karakuri_meta_template = dict( - round=[ - dict(role="HUMAN", begin='[INST] ', end=' [ATTR] helpfulness: 4 correctness: 4 coherence: 4 complexity: 4 verbosity: 4 quality: 4 toxicity: 0 humor: 0 creativity: 0 [/ATTR] [/INST]'), - dict(role="BOT", begin="", end='', generate=True), - ], - eos_token_id=2, -) -suzume_meta_template = dict( - round=[ - dict(role="HUMAN", begin='<|start_header_id|>user<|end_header_id|>\n\n', end='<|eot_id|>'), - dict(role="BOT", begin="<|start_header_id|>assistant<|end_header_id|>\n\n", end='<|eot_id|>', generate=True), - ], - eos_token_id=128009, -) GPU_NUMS = 2 - prefix = '/maindata/data/user/ai_story/yao.liu/multilingual/Japanese' v1_6_path = f'{prefix}/Qwen2-57B-A14B-Instruct_SFT_SEQ4096_LR5e-6_EP4_GBS32x1x1_NEFT0_20240609_synthetic0530_common2/checkpoints/checkpoint-260' v1_5_path = 
f'{prefix}/Qwen2-57B-A14B-Instruct_SFT_SEQ4096_LR5e-6_EP4_GBS32x1x1_NEFT0_20240609_synthetic0530_common1/checkpoints/checkpoint-260' @@ -118,7 +90,6 @@ ] datasets = [x for x in subjective_datasets if x['abbr'] in ['mtbench101_ja']] - judge_models = [dict( abbr='GPT4-Turbo', type=OpenAI, @@ -161,4 +132,4 @@ summarizer = dict(type=MTBench101Summarizer, judge_type='single') -work_dir = 'outputs/mtbench101/' \ No newline at end of file +work_dir = 'outputs/mtbench101_ja/' diff --git a/liuyao/script.sh b/liuyao/script.sh index 7e08fbda6..baa9daebe 100644 --- a/liuyao/script.sh +++ b/liuyao/script.sh @@ -2,9 +2,19 @@ export MKL_SERVICE_FORCE_INTEL=1 export MKL_THREADING_LAYER=1 # 添加以上两行,否则报错:[[0m Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library. -python run.py liuyao/eval_subjective_mtbench101.py --debug +# Japanese /maindata/data/user/ai_story/zhigong.wang/miniconda3/envs/opencompass/bin/python run.py liuyao/eval_subjective_mtbench101.py --reuse 20240625_105451 -python run.py liuyao/eval_subjective_mtbench101.py --reuse 20240624_000906 --debug -python run.py liuyao/eval_subjective_mtbench101.py --mode infer --debug -python run.py liuyao/eval_subjective_mtbench101.py --mode eval --reuse 20240625_145959 -python run.py liuyao/eval_subjective_mtbench101.py --mode viz --reuse 20240625_145959 +python run.py liuyao/eval_subjective_mtbench101_ja.py --debug +python run.py liuyao/eval_subjective_mtbench101_ja.py --reuse 20240624_000906 --debug +python run.py liuyao/eval_subjective_mtbench101_ja.py --mode infer --debug +python run.py liuyao/eval_subjective_mtbench101_ja.py --mode eval --reuse 20240625_145959 +python run.py liuyao/eval_subjective_mtbench101_ja.py --mode viz --reuse 20240625_145959 + +# Indonesian +python run.py liuyao/eval_subjective_mtbench101_id.py --reuse --debug +python run.py liuyao/eval_subjective_mtbench101_id.py --mode infer --reuse +python run.py liuyao/eval_subjective_mtbench101_id.py --mode 
def _chat_template(human_begin, human_end, bot_begin, bot_end, eos_token_id):
    """Build a two-role (HUMAN/BOT) chat meta template with an explicit EOS id."""
    return dict(
        round=[
            dict(role='HUMAN', begin=human_begin, end=human_end),
            dict(role='BOT', begin=bot_begin, end=bot_end, generate=True),
        ],
        eos_token_id=eos_token_id,
    )


# Template for API-backed judge models: roles map straight onto API roles.
api_meta_template = dict(
    round=[
        dict(role='SYSTEM', api_role='SYSTEM'),
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# Qwen / ChatML-style prompt markers.
qwen_meta_template = _chat_template(
    '<|im_start|>user\n', '<|im_end|>\n',
    '<|im_start|>assistant\n', '<|im_end|>\n',
    151645,
)

# KARAKURI LM: [INST] ... [/INST] prompt with attribute conditioning appended
# to every human turn.
karakuri_meta_template = _chat_template(
    '[INST] ',
    ' [ATTR] helpfulness: 4 correctness: 4 coherence: 4 complexity: 4 verbosity: 4 quality: 4 toxicity: 0 humor: 0 creativity: 0 [/ATTR] [/INST]',
    '', '',
    2,
)

# Suzume (Llama-3 based) header / eot markers.
suzume_meta_template = _chat_template(
    '<|start_header_id|>user<|end_header_id|>\n\n', '<|eot_id|>',
    '<|start_header_id|>assistant<|end_header_id|>\n\n', '<|eot_id|>',
    128009,
)