[Feature] Support AlpacaEval_V2 (#1006)
* support alpacaeval_v2

* support alpacaeval

* update docs

* update docs
bittersweet1999 committed Mar 28, 2024
1 parent 0a6a03f commit 02e7eec
Showing 13 changed files with 326 additions and 62 deletions.
@@ -90,7 +90,7 @@
dict(
abbr=f"{_name}",
type=SubjectiveCmpDataset,
path="./data/subjective/",
path="./data/subjective/alpaca_eval",
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
@@ -92,7 +92,7 @@
dict(
abbr=f"{_name}",
type=SubjectiveCmpDataset,
path="./data/subjective/",
path="./data/subjective/alpaca_eval",
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
44 changes: 8 additions & 36 deletions configs/eval_subjective_alpacaeval.py
@@ -1,7 +1,6 @@
from mmengine.config import read_base

with read_base():
from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
@@ -12,7 +11,7 @@
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
from opencompass.summarizers import AlpacaSummarizer

api_meta_template = dict(
@@ -29,7 +28,7 @@
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
abbr='chatglm3-6b',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
@@ -54,52 +53,25 @@

datasets = [*alpacav2]

gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-infer gpt4's predictions, or choose to use the pre-committed gpt4 predictions



# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_model = dict(
gpt4_judge = dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
config='weighted_alpaca_eval_gpt4_turbo'
)

## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models
type=NaivePartitioner
),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
type=LocalRunner,
max_num_workers=256,
task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
),
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
)
)
work_dir = 'outputs/alpaca/'

summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
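
Taken together, the changes above replace the OpenCompass-internal SubjectiveEvalTask pipeline with the official alpaca-eval toolchain: a NaivePartitioner and a LocalRunner drive an AlpacaEvalTask whose judge is described by a small dict. The annotated sketch below is a hedged reading of that dict; the comments are inferred from this diff rather than quoted from project documentation.

```python
# Annotated sketch of the judge config consumed by AlpacaEvalTask
# (comments are editorial guesses based on the diff above):
gpt4_judge = dict(
    abbr='GPT4-Turbo',                         # label used in result tables
    path='gpt-4-1106-preview',                 # judge model queried through the OpenAI API
    key='',                                    # falls back to $OPENAI_API_KEY when empty
    config='weighted_alpaca_eval_gpt4_turbo',  # annotator preset shipped with alpaca_eval
)
```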
105 changes: 105 additions & 0 deletions configs/eval_subjective_alpacaeval_oc.py
@@ -0,0 +1,105 @@
from mmengine.config import read_base

with read_base():
from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlpacaSummarizer

api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

# For subjective evaluation, we often enable do_sample for the models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

datasets = [*alpacav2]

gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-infer gpt4's predictions, or choose to use the pre-committed gpt4 predictions



# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_model = dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
)

## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models
),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model),
),
)
work_dir = 'outputs/alpaca/'

summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
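
The new eval_subjective_alpacaeval_oc.py keeps the original OpenCompass-native pipeline, in which SubjectiveSizePartitioner runs in mode='m2n' to pair base models with candidate models before the judge scores each pair. The snippet below is only a schematic illustration of that pairing, under the assumption that m2n means a full cross-product of base_models and compare_models; it is not the partitioner's actual implementation.

```python
# Schematic illustration of the assumed m2n pairing (not OpenCompass code):
base_models = ['gpt4-turbo']          # reference model, re-inferred or pre-committed
compare_models = ['chatglm3-6b-hf']   # candidate models under evaluation
pairs = [(base, cand) for base in base_models for cand in compare_models]
print(pairs)  # [('gpt4-turbo', 'chatglm3-6b-hf')]
```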
18 changes: 14 additions & 4 deletions configs/subjective/eval_subjective_alpacaeval.py
@@ -7,14 +7,24 @@
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.summarizers import AlpacaSummarizer
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
datasets = [*alpacav2]
gpt4_judge = dict(
abbr='GPT4-Turbo',
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
config='weighted_alpaca_eval_gpt4_turbo'
)
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models
type=NaivePartitioner
),
runner=runner,
given_pred=given_pred
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
)
)
work_dir = 'outputs/alpaca/'

summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
7 changes: 7 additions & 0 deletions docs/en/advanced_guides/subjective_evaluation.md
@@ -13,6 +13,13 @@ A popular evaluation method involves

We support the use of GPT-4 (or other JudgeLLMs) for the subjective evaluation of models based on the above methods.

## Current Supported Subjective Evaluation Datasets

1. AlignBench (https://github.com/THUDM/AlignBench)
2. MTBench (https://github.com/lm-sys/FastChat)
3. AlpacaEvalv2 (https://github.com/tatsu-lab/alpaca_eval)
4. CompassArena (Internal dataset)

## Subjective Evaluation with Custom Dataset

The specific process includes:
13 changes: 13 additions & 0 deletions docs/en/get_started/installation.md
@@ -72,6 +72,19 @@

</details>

5. Install alpaca-eval (Optional):

If you want to **evaluate the alpaca-eval dataset using the official alpaca-eval implementation**, follow this step; otherwise, skip it.

<details>
<summary><b>click to show the details</b></summary>

```bash
pip install alpaca-eval
```

</details>
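
After installing the optional dependency, a quick import check confirms it is visible in the current environment. This is only a sanity-check sketch and assumes the PyPI package installs a module named `alpaca_eval`, as in the tatsu-lab/alpaca_eval repository.

```python
# Sanity check that the optional alpaca-eval dependency is importable
# (assumes the package installs a module named `alpaca_eval`):
import importlib.util

spec = importlib.util.find_spec('alpaca_eval')
print('alpaca-eval installed:', spec is not None)
```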

# Dataset Preparation

The datasets supported by OpenCompass mainly include two parts:
7 changes: 7 additions & 0 deletions docs/zh_cn/advanced_guides/subjective_evaluation.md
@@ -13,6 +13,13 @@

我们基于以上方法支持了JudgeLLM用于模型的主观能力评估(目前opencompass仓库里支持的所有模型都可以直接作为JudgeLLM进行调用,此外一些专用的JudgeLLM我们也在计划支持中)。

## 目前已支持的主观评测数据集

1. AlignBench (https://github.com/THUDM/AlignBench)
2. MTBench (https://github.com/lm-sys/FastChat)
3. AlpacaEvalv2 (https://github.com/tatsu-lab/alpaca_eval)
4. CompassArena(内部数据集)

## 自定义主观数据集评测

主观评测的具体流程包括:
13 changes: 13 additions & 0 deletions docs/zh_cn/get_started/installation.md
@@ -73,6 +73,19 @@

</details>

5. 安装 alpaca-eval(可选):

如果你需要**使用官方alpaca-eval实现评测 alpaca-eval 数据集**,请执行此步骤,否则忽略这一步。

<details>
<summary><b>点击查看详细</b></summary>

```bash
pip install alpaca-eval
```

</details>

# 数据集准备

OpenCompass 支持的数据集主要包括两个部分:
11 changes: 10 additions & 1 deletion opencompass/models/openai_api.py
@@ -65,6 +65,8 @@ def __init__(self,
meta_template: Optional[Dict] = None,
openai_api_base: str = OPENAI_API_BASE,
mode: str = 'none',
logprobs: Optional[bool] = False,
top_logprobs: Optional[int] = None,
temperature: Optional[float] = None):

super().__init__(path=path,
@@ -78,6 +80,8 @@ def __init__(self,
self.temperature = temperature
assert mode in ['none', 'front', 'mid', 'rear']
self.mode = mode
self.logprobs = logprobs
self.top_logprobs = top_logprobs

if isinstance(key, str):
self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
@@ -218,6 +222,8 @@ def _generate(self, input: str or PromptList, max_out_len: int,
messages=messages,
max_tokens=max_out_len,
n=1,
logprobs=self.logprobs,
top_logprobs=self.top_logprobs,
stop=None,
temperature=temperature,
)
@@ -234,7 +240,10 @@ def _generate(self, input: str or PromptList, max_out_len: int,
str(raw_response.content))
continue
try:
return response['choices'][0]['message']['content'].strip()
if self.logprobs:
return response['choices']
else:
return response['choices'][0]['message']['content'].strip()
except KeyError:
if 'error' in response:
if response['error']['code'] == 'rate_limit_exceeded':
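
The openai_api.py change threads two new constructor arguments through to the chat-completions request and alters the return value when they are enabled. The sketch below shows how a judge config might switch them on; the parameter names come from the diff, while the specific top_logprobs value is an illustrative assumption.

```python
# Hypothetical judge config exercising the new parameters (a sketch, not the
# project's reference configuration; top_logprobs=5 is an illustrative value):
from opencompass.models.openai_api import OpenAI

judge_with_logprobs = dict(
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',                 # falls back to $OPENAI_API_KEY when empty
    logprobs=True,          # forwarded to the chat-completions request
    top_logprobs=5,         # number of alternative tokens returned per position
    temperature=0,
)
# Note: with logprobs enabled, _generate() returns the raw `choices` list
# instead of the stripped message content, so callers must unpack it themselves.
```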