Skip to content

Commit

Permalink
[Feature] add support for internal Followbench (#1511)
Browse files Browse the repository at this point in the history
* fix pip version

* fix pip version

* add internal followbench

* add internal followbench

* fix lint

* fix lint
  • Loading branch information
bittersweet1999 authored Sep 11, 2024
1 parent 3177633 commit 7c7fa36
Show file tree
Hide file tree
Showing 9 changed files with 560 additions and 2 deletions.
34 changes: 33 additions & 1 deletion configs/datasets/subjective/fofo/fofo_bilingual_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
'fofo_test_prompts', 'fofo_test_prompts_cn',
]

base_prompt = """
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
Expand Down Expand Up @@ -45,9 +45,41 @@
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""


# Chinese-language judge prompt, used for the '*_cn' subsets. It instructs a
# judge LLM to check whether a model answer follows the format requested in the
# original prompt, and to reply with a strict JSON string containing `model`,
# `format_correctness` (1 = correct, 0 = wrong) and `reasons`. The `{question}`
# and `{prediction}` placeholders are filled in per example at evaluation time,
# so this literal must not be altered.
base_prompt_cn = """
我希望你创建一个排行榜,用于评估来自各种大型语言模型的回答格式的正确性。为了完成这个任务,你将需要分析给模型的文本提示以及它们对应的回答。具体来说,请确保你的评估输出正确地格式化为JSON字符串。我将为此提供提示和回答。
以下是提示内容:
{
"instruction": "{question}",
}
以下是模型的输出结果:
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式。进行彻底的格式检查,并提供格式正确或错误的详细解释。你的反馈应包括模型的名称,接着是格式正确性的状态,用'1'表示正确,'0'表示错误。将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现。换句话说,你应该生成以下输出:
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意,你的回答应是一个正确格式化的JSON字符串,不应包含任何额外的内容。我们将在Python中直接将其作为JSON字符串加载。
"""


fofo_datasets = []

for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
Expand Down
63 changes: 63 additions & 0 deletions configs/datasets/subjective/followbench/followbench_llmeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer

# Reader configuration shared by both FollowBench splits: each example supplies
# the raw instruction shown to the evaluated model plus a pre-built judge
# prompt; the `judge` column carries the reference judgement.
subjective_reader_cfg = dict(
    input_columns=['instruction', 'judge_prompt'],
    output_column='judge',
)

# One config per language split (Chinese / English).
subjective_all_sets = [
    'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path = 'data/subjective/followbench/converted_data'

followbench_llmeval_dataset = []

for _name in subjective_all_sets:
    # Stage 1 (inference): the model under test answers the instruction
    # verbatim; generation is capped at 2048 tokens.
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt='{instruction}'),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=2048),
    )

    # Stage 2 (evaluation): an LLM judge receives the dataset's pre-rendered
    # judge prompt and scores the prediction produced by the 'BOT' role.
    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt='{judge_prompt}'),
                ]),
            ),
        ),
        pred_role='BOT',
    )

    followbench_llmeval_dataset.append(
        dict(
            abbr=_name,  # f'{_name}' was redundant: _name is already a str
            type=FollowBenchDataset,
            path=data_path,
            name=_name,
            mode='singlescore',
            cate='llm',
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            summarizer=dict(type=FollowBenchSummarizer),
        ))
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
'fofo_test_prompts', 'fofo_test_prompts_cn',
]

base_prompt = """
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
Expand Down Expand Up @@ -45,9 +45,41 @@
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""


# Chinese-language judge prompt, used for the '*_cn' subsets. It instructs a
# judge LLM to check whether a model answer follows the format requested in the
# original prompt, and to reply with a strict JSON string containing `model`,
# `format_correctness` (1 = correct, 0 = wrong) and `reasons`. The `{question}`
# and `{prediction}` placeholders are filled in per example at evaluation time,
# so this literal must not be altered.
base_prompt_cn = """
我希望你创建一个排行榜,用于评估来自各种大型语言模型的回答格式的正确性。为了完成这个任务,你将需要分析给模型的文本提示以及它们对应的回答。具体来说,请确保你的评估输出正确地格式化为JSON字符串。我将为此提供提示和回答。
以下是提示内容:
{
"instruction": "{question}",
}
以下是模型的输出结果:
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式。进行彻底的格式检查,并提供格式正确或错误的详细解释。你的反馈应包括模型的名称,接着是格式正确性的状态,用'1'表示正确,'0'表示错误。将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现。换句话说,你应该生成以下输出:
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意,你的回答应是一个正确格式化的JSON字符串,不应包含任何额外的内容。我们将在Python中直接将其作为JSON字符串加载。
"""


fofo_datasets = []

for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer

# Reader settings shared by both FollowBench LLM-eval splits: the `judge`
# column is the reference output, while the instruction and the pre-built
# judge prompt come in as inputs.
subjective_reader_cfg = dict(
    input_columns=['instruction', 'judge_prompt',],
    output_column='judge',
)

# Chinese and English splits of the converted FollowBench data.
subjective_all_sets = [
    'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'

followbench_llmeval_dataset = []

for _name in subjective_all_sets:
    # Stage 1: the evaluated model answers the instruction as-is.
    # (Underscore-prefixed temporaries are ignored by the config loader.)
    _infer_round = [dict(role='HUMAN', prompt='{instruction}')]
    subjective_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate,
                             template=dict(round=_infer_round)),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=2048),
    )

    # Stage 2: an LLM judge grades the 'BOT' prediction using the dataset's
    # ready-made judge prompt.
    _judge_round = [dict(role='HUMAN', prompt = '{judge_prompt}')]
    subjective_eval_cfg = dict(
        evaluator=dict(type=LMEvaluator,
                       prompt_template=dict(type=PromptTemplate,
                                            template=dict(round=_judge_round))),
        pred_role='BOT',
    )

    # Assemble the per-split dataset entry and register it.
    _dataset_cfg = dict(
        abbr=f'{_name}',
        type=FollowBenchDataset,
        path=data_path,
        name=_name,
        mode='singlescore',
        cate='llm',
        reader_cfg=subjective_reader_cfg,
        infer_cfg=subjective_infer_cfg,
        eval_cfg=subjective_eval_cfg,
        summarizer = dict(type=FollowBenchSummarizer,),
    )
    followbench_llmeval_dataset.append(_dataset_cfg)
1 change: 1 addition & 0 deletions opencompass/datasets/subjective/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .corev2 import Corev2Dataset # noqa: F401, F403
from .creationbench import CreationBenchDataset # noqa: F401, F403
from .fofo import FofoDataset # noqa: F401, F403
from .followbench import FollowBenchDataset # noqa: F401, F403
from .information_retrival import IRDataset # noqa: F401, F403
from .mtbench import MTBenchDataset # noqa: F401, F403
from .mtbench101 import MTBench101Dataset # noqa: F401, F403
Expand Down
Loading

0 comments on commit 7c7fa36

Please sign in to comment.