From 0665bb91a8eccbdc203fa7e8be90e050aefe950c Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Fri, 22 Mar 2024 19:54:19 +0800 Subject: [PATCH] [Fix] Quick fix (#995) --- configs/eval_subjective_alignbench.py | 16 +--- configs/eval_subjective_alpacaeval.py | 11 +-- configs/eval_subjective_compassarena.py | 11 --- configs/eval_subjective_corev2.py | 115 ----------------------- configs/eval_subjective_creationbench.py | 16 +--- configs/eval_subjective_mtbench.py | 16 +--- 6 files changed, 7 insertions(+), 178 deletions(-) delete mode 100644 configs/eval_subjective_corev2.py diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py index 8f60016bf..0563ff874 100644 --- a/configs/eval_subjective_alignbench.py +++ b/configs/eval_subjective_alignbench.py @@ -3,7 +3,7 @@ with read_base(): from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner @@ -51,26 +51,14 @@ datasets = [*subjective_datasets] -infer = dict( - partitioner=dict(type=NaivePartitioner), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) - # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration judge_model = dict( abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, + type=OpenAI, path='gpt-4-1106-preview', key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', meta_template=api_meta_template, query_per_second=16, max_out_len=2048, diff --git a/configs/eval_subjective_alpacaeval.py b/configs/eval_subjective_alpacaeval.py index 098547b93..13fd5ebe5 100644 --- a/configs/eval_subjective_alpacaeval.py +++ b/configs/eval_subjective_alpacaeval.py @@ -68,16 +68,7 @@ temperature=1, ) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions -infer = dict( - partitioner=dict(type=NaivePartitioner), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) + # -------------Evalation Stage ---------------------------------------- diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py index 58336a5c6..5e1f93eeb 100644 --- a/configs/eval_subjective_compassarena.py +++ b/configs/eval_subjective_compassarena.py @@ -69,17 +69,6 @@ temperature=1, ) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions -infer = dict( - partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000), - runner=dict( - type=SlurmSequentialRunner, - partition='llm_dev2', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) - # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration diff --git a/configs/eval_subjective_corev2.py b/configs/eval_subjective_corev2.py deleted file mode 100644 index 2ca07b433..000000000 --- a/configs/eval_subjective_corev2.py +++ /dev/null @@ -1,115 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .datasets.subjective.subjective_cmp.subjective_corev2 import subjective_datasets - -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.partitioners import NaivePartitioner, SizePartitioner -from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner -from opencompass.partitioners.sub_size import SubjectiveSizePartitioner -from opencompass.runners import LocalRunner -from opencompass.runners import SlurmSequentialRunner -from opencompass.tasks import OpenICLInferTask -from opencompass.tasks.subjective_eval import SubjectiveEvalTask -from opencompass.summarizers import Corev2Summarizer - -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ], - reserved_roles=[ - dict(role='SYSTEM', api_role='SYSTEM'), - ], -) - -# -------------Inference Stage ---------------------------------------- - -# For subjective evaluation, we often set do sample for models -models = [ - dict( - type=HuggingFaceChatGLM3, - abbr='chatglm3-6b-hf', - path='THUDM/chatglm3-6b', - tokenizer_path='THUDM/chatglm3-6b', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True, - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), - generation_kwargs=dict( - do_sample=True, - ), - meta_template=api_meta_template, - max_out_len=2048, - max_seq_len=4096, - batch_size=1, - run_cfg=dict(num_gpus=1, num_procs=1), - ) -] - -datasets = [*subjective_datasets] - -gpt4 = dict( - abbr='gpt4-turbo', - type=OpenAI, - path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=2048, - max_seq_len=4096, - batch_size=4, - retry=20, - temperature=1, -) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions - -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=500), - runner=dict( - type=SlurmSequentialRunner, - partition='llm_dev2', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) - -# -------------Evalation Stage ---------------------------------------- - -## ------------- JudgeLLM Configuration -judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAI, - path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=1024, - max_seq_len=4096, - batch_size=2, - retry=20, - temperature=0, -) - -## ------------- Evaluation Configuration -eval = dict( - partitioner=dict( - type=SubjectiveSizePartitioner, mode='m2n', max_task_size=500, base_models=[gpt4], compare_models=models - ), - runner=dict( - type=SlurmSequentialRunner, - partition='llm_dev2', - quotatype='auto', - max_num_workers=256, - task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), - ), -) - -summarizer = dict(type=Corev2Summarizer, match_method='smart') - -work_dir = 'outputs/corev2/' diff --git a/configs/eval_subjective_creationbench.py b/configs/eval_subjective_creationbench.py index 52bf7d4b8..922225f11 100644 --- a/configs/eval_subjective_creationbench.py +++ b/configs/eval_subjective_creationbench.py @@ -3,7 +3,7 @@ with read_base(): from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner @@ -51,26 +51,14 @@ datasets = [*subjective_datasets] -infer = dict( - partitioner=dict(type=NaivePartitioner), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) - # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration judge_model = dict( abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, + type=OpenAI, path='gpt-4-1106-preview', key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', meta_template=api_meta_template, query_per_second=16, max_out_len=2048, diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index 940edabb3..c8dbb23c0 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -4,7 +4,7 @@ from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.models.openai_api import OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner @@ -59,26 +59,14 @@ datasets = [*subjective_datasets] -infer = dict( - partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000), - runner=dict( - type=SlurmSequentialRunner, - partition='llm_dev2', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask), - ), -) - # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration judge_model = dict( abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, + type=OpenAI, path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', meta_template=api_meta_template, query_per_second=16, max_out_len=2048,