-
Notifications
You must be signed in to change notification settings - Fork 405
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add compass arena * add compass_arena * add compass arena * Update opencompass/summarizers/subjective/compass_arena.py Co-authored-by: Songyang Zhang <[email protected]> * Update opencompass/summarizers/subjective/__init__.py Co-authored-by: Songyang Zhang <[email protected]> * Update opencompass/datasets/subjective/compass_arena.py Co-authored-by: Songyang Zhang <[email protected]> * Update opencompass/datasets/subjective/__init__.py Co-authored-by: Songyang Zhang <[email protected]> * Update configs/eval_subjective_compassarena.py Co-authored-by: Songyang Zhang <[email protected]> * Update configs/datasets/subjective/compassarena/compassarena_compare.py Co-authored-by: Songyang Zhang <[email protected]> * Update configs/eval_subjective_compassarena.py Co-authored-by: Songyang Zhang <[email protected]> * Update configs/datasets/subjective/compassarena/compassarena_compare.py Co-authored-by: Songyang Zhang <[email protected]> * fix check position bias --------- Co-authored-by: Songyang Zhang <[email protected]>
- Loading branch information
1 parent
40a2441
commit 2d4da8d
Showing
7 changed files
with
461 additions
and
1 deletion.
There are no files selected for viewing
160 changes: 160 additions & 0 deletions
160
configs/datasets/subjective/compassarena/compassarena_compare.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
from opencompass.openicl.icl_prompt_template import PromptTemplate | ||
from opencompass.openicl.icl_retriever import ZeroRetriever | ||
from opencompass.openicl.icl_inferencer import GenInferencer | ||
from opencompass.openicl.icl_evaluator import LMEvaluator | ||
from opencompass.datasets import CompassArenaDataset | ||
|
||
# Reader settings: 'question' and 'ref' are the input columns, 'judge' is the
# output column.
subjective_reader_cfg = {
    'input_columns': ['question', 'ref'],
    'output_column': 'judge',
}

# Passed as `path` to every dataset entry built below.
data_path = "data/subjective/"

# Collects one dataset config per subset; filled by the loop at the end of
# this file.
subjective_datasets = []
|
||
# Shared tail of every judge prompt: shows the two candidate answers and asks
# the judge to pick A (answer 1), B (answer 2) or C (tie) with a reason.
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""

# Knowledge subset: grading criteria + question + reference answer.
knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt

# Language subset: grading criteria + question + reference answer.
language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt

# Math subset: grading criteria + question + reference answer.
math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt

# The reasoning subset is judged with exactly the same prompt as math.
reason_prompt = math_prompt

# QA subset: grading criteria + question only (no reference answer slot).
qa_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先具有事实正确性,即除了想象的内容外,所引用或阐述的各种信息都是真实正确的
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答,且前后连贯,逻辑没有问题
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
[用户问题]
{question}
""" + base_prompt

# Creation subset: grading criteria + question only (no reference answer slot).
creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
|
||
|
||
# Subset names and their judge prompts; the two lists are paired by position.
subjective_all_sets = ["knowledge", "language", "math", "reason", "qa", "creationv2_zh"]
prompt_all_sets = [knowledge_prompt, language_prompt, math_prompt, reason_prompt, qa_prompt, creation_prompt]

for _name, _prompt in zip(subjective_all_sets, prompt_all_sets):
    # Inference: each model is simply asked the raw question, zero-shot.
    subjective_infer_cfg = {
        'prompt_template': {
            'type': PromptTemplate,
            'template': {
                'round': [
                    {'role': 'HUMAN', 'prompt': "{question}"},
                ],
            },
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {
            'type': GenInferencer,
            'max_seq_len': 4096,
            'max_out_len': 2048,
        },
    }

    # Evaluation: the judge receives this subset's comparison prompt.
    subjective_eval_cfg = {
        'evaluator': {
            'type': LMEvaluator,
            'infer_order': 'double',
            'prompt_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [
                        {'role': 'HUMAN', 'prompt': _prompt},
                    ],
                },
            },
        },
        'pred_role': "BOT",
    }

    # Register the subset; its abbreviation is the subset name itself.
    subjective_datasets.append({
        'abbr': _name,
        'type': CompassArenaDataset,
        'path': data_path,
        'name': _name,
        'reader_cfg': subjective_reader_cfg,
        'infer_cfg': subjective_infer_cfg,
        'eval_cfg': subjective_eval_cfg,
    })
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from os import getenv as gv | ||
from opencompass.models import HuggingFaceCausalLM | ||
from mmengine.config import read_base | ||
with read_base(): | ||
from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model | ||
from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model | ||
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets | ||
|
||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI | ||
from opencompass.models.openai_api import OpenAIAllesAPIN | ||
from opencompass.partitioners import NaivePartitioner, SizePartitioner | ||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner | ||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner | ||
from opencompass.runners import LocalRunner | ||
from opencompass.runners import SlurmSequentialRunner | ||
from opencompass.tasks import OpenICLInferTask | ||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask | ||
from opencompass.summarizers import CompassArenaSummarizer | ||
|
||
# Inference stage: tasks are split by size and run through a sequential Slurm
# runner. (NaivePartitioner is the alternative partitioner that was previously
# tried here.)
infer = {
    'partitioner': {'type': SizePartitioner, 'max_task_size': 10000},
    'runner': {
        'type': SlurmSequentialRunner,
        'partition': 'llm_dev2',
        'quotatype': 'auto',
        'max_num_workers': 256,
        'task': {'type': OpenICLInferTask},
    },
}
|
||
# Role mapping shared by the API-based models below; the BOT turn is the one
# marked for generation.
api_meta_template = {
    'round': [
        {'role': 'HUMAN', 'api_role': 'HUMAN'},
        {'role': 'BOT', 'api_role': 'BOT', 'generate': True},
    ],
}
|
||
# GPT-4 Turbo model entry; referenced as the base model by the evaluation
# partitioner further down.
gpt4 = {
    'abbr': 'gpt4-turbo',
    'type': OpenAI,
    'path': 'gpt-4-1106-preview',
    # The key will be obtained from $OPENAI_API_KEY, but you can write down
    # your key here as well
    'key': '',
    'meta_template': api_meta_template,
    'query_per_second': 1,
    'max_out_len': 2048,
    'max_seq_len': 4096,
    'batch_size': 4,
    'retry': 20,
    'temperature': 1,
}

# Models to run inference on, and the datasets to run them against.
models = [*chatglm3_6b_32k_model, *yi_6b_chat_model]
datasets = [*subjective_datasets]

# Output directory for this run.
work_dir = 'outputs/compass_arena/'
|
||
# ------------- Evaluation Stage ----------------------------------------

# Judge model handed to SubjectiveEvalTask: GPT-4 Turbo at temperature 0.
judge_model = {
    'abbr': 'GPT4-Turbo',
    'type': OpenAI,
    'path': 'gpt-4-1106-preview',
    # The key will be obtained from $OPENAI_API_KEY, but you can write down
    # your key here as well
    'key': '',
    'meta_template': api_meta_template,
    'query_per_second': 1,
    'max_out_len': 1024,
    'max_seq_len': 4096,
    'batch_size': 2,
    'retry': 20,
    'temperature': 0,
}
## ------------- Evaluation Configuration
# Pairwise comparison in 'm2n' mode: every model in `compare_models` is judged
# against every model in `base_models`, with tasks split by size.
eval = {
    'partitioner': {
        'type': SubjectiveSizePartitioner,
        'strategy': 'split',
        'max_task_size': 10000,
        'mode': 'm2n',
        'base_models': [gpt4],
        'compare_models': [*chatglm3_6b_32k_model, *yi_6b_chat_model],
    },
    'runner': {
        'type': SlurmSequentialRunner,
        'partition': 'llm_dev2',
        'quotatype': 'auto',
        'max_num_workers': 32,
        'task': {
            'type': SubjectiveEvalTask,
            'judge_cfg': judge_model,
        },
    },
}
|
||
|
||
# Results are aggregated by the CompassArena-specific summarizer.
summarizer = {'type': CompassArenaSummarizer}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from datasets import Dataset | ||
|
||
from opencompass.registry import LOAD_DATASET | ||
|
||
from .subjective_cmp import SubjectiveCmpDataset | ||
|
||
|
||
@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):
    """Subjective comparison dataset that adds a ``ref`` field to each row."""

    def load(
        self,
        path: str,
        name: str,
    ):
        """Load a subset through the parent loader and attach ``ref``.

        Args:
            path: Forwarded unchanged to ``SubjectiveCmpDataset.load``.
            name: Forwarded unchanged to ``SubjectiveCmpDataset.load``.

        Returns:
            A ``Dataset`` built from the parent's rows, where each row's
            ``ref`` is ``others['reference']`` when that key exists and is
            not ``None``, and a generic placeholder sentence otherwise.
        """
        # Placeholder reference used when a row carries no usable reference.
        fallback_ref = '满足用户需求,言之有理即可'

        rows = []
        for data in list(super().load(path, name)):
            others = data['others']
            if 'reference' in others and others['reference'] is not None:
                data['ref'] = others['reference']
            else:
                data['ref'] = fallback_ref
            rows.append(data)

        return Dataset.from_list(rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.