[Feat] support humaneval and mbpp pass@k (#598)
* [Feat] support pass@k
* [Feat] support pass@k docs
* update naming

Co-authored-by: Leymore <[email protected]>
Showing 13 changed files with 622 additions and 65 deletions.
@@ -0,0 +1,54 @@
# This config is used for pass@k evaluation with `num_return_sequences`,
# i.e. for models that can generate multiple responses for a single input.
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.models import HuggingFaceCausalLM
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

# Switch MBPP to the pass@k-compatible dataset and evaluator.
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='CodeLlama-7b-Python',
        path="codellama/CodeLlama-7b-Python-hf",
        tokenizer_path='codellama/CodeLlama-7b-Python-hf',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        generation_kwargs=dict(
            num_return_sequences=10,  # 10 sampled completions per prompt
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=300),
    runner=dict(
        type=LocalRunner, max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
)
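Assuming the standard OpenCompass entry point, a config like this one would typically be launched with `python run.py configs/eval_code_passk.py` (the path under which the tutorial below refers to it).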
@@ -0,0 +1,65 @@
# This config is used for pass@k evaluation with dataset repetition,
# i.e. for models that cannot generate multiple responses for a single input.
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner
from opencompass.models import HuggingFaceCausalLM
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

# Repeat each dataset 10 times and rename its abbr so that cached
# dataset sizes stay consistent with the repeated version.
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
mbpp_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin="<|User|>:", end="\n"),
        dict(role="BOT", begin="<|Bot|>:", end="<eoa>\n", generate=True),
    ],
)

models = [
    dict(
        abbr="internlm-chat-7b-hf-v11",
        type=HuggingFaceCausalLM,
        path="internlm/internlm-chat-7b-v1_1",
        tokenizer_path="internlm/internlm-chat-7b-v1_1",
        tokenizer_kwargs=dict(
            padding_side="left",
            truncation_side="left",
            use_fast=False,
            trust_remote_code=True,
        ),
        max_seq_len=2048,
        meta_template=_meta_template,
        model_kwargs=dict(trust_remote_code=True, device_map="auto"),
        generation_kwargs=dict(
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
        batch_size=8,
    )
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=600),
    runner=dict(
        type=LocalRunner, max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
)
@@ -0,0 +1,104 @@
# Code Evaluation Tutorial

This tutorial focuses on evaluating a model's coding proficiency, using `humaneval` and `mbpp` as examples.

## pass@1

If you only need a single response per problem to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), following the general [quick start tutorial](../get_started/quick_start.md).
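
For instance, a minimal pass@1 config can simply pull in both dataset definitions and add a model section as in the quick start tutorial (this is a sketch of the usual pattern, not a file shipped with this commit):

```python
from mmengine.config import read_base

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

datasets = humaneval_datasets + mbpp_datasets

# models = [...]  # defined exactly as in the quick start tutorial
```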

For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).

## pass@k

If you need to generate multiple responses per problem to evaluate pass@k performance, there are two situations to consider. Here we use 10 responses as an example:

### Typical Situation

Most models support the `num_return_sequences` argument of HF generation, which we can use directly to obtain multiple responses. Refer to the following configuration file:

```python
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        ...,
        generation_kwargs=dict(
            num_return_sequences=10,
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        ...,
    )
]
```

For `mbpp`, the dataset and evaluation logic need to change, so we modify the `type`, `eval_cfg.evaluator.type`, and `reader_cfg.output_column` fields accordingly.

We also need randomness in the model's responses, so `generation_kwargs` must be set; in particular, `num_return_sequences` controls how many responses are returned per input.
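
To see what `num_return_sequences` does on its own, here is a minimal standalone sketch using HF's `generate` directly (with `gpt2` purely as a stand-in model; in OpenCompass these arguments are simply passed through `generation_kwargs`):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    num_return_sequences=10,  # 10 sampled completions for this one prompt
    pad_token_id=tokenizer.eos_token_id,
)
print(outputs.shape)  # (10, sequence_length): one row per sampled completion
```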

Note that `num_return_sequences` must be greater than or equal to k, since pass@k is itself a probability estimate computed over the sampled responses.
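
For reference, the standard unbiased pass@k estimator from the Codex paper (Chen et al., 2021) looks like the sketch below, where `n` responses are generated for a problem and `c` of them pass the tests; whether `MBPPPassKEvaluator` implements exactly this form is not shown here:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimate of pass@k given n samples of which c are correct."""
    if n - c < k:
        return 1.0  # every size-k subset contains at least one correct sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Example: 10 samples, 3 correct -> pass@1 is 0.3, pass@10 is 1.0
print(pass_at_k(10, 3, 1), pass_at_k(10, 3, 10))
```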

For a complete example, see [configs/eval_code_passk.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk.py).
### For Models That Do Not Support Multiple Responses

This applies to some HF models with poorly designed APIs or missing features. In this case, we repeat entries in the dataset to obtain the effect of multiple responses. Refer to the following configuration:

```python
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
mbpp_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        ...,
        generation_kwargs=dict(
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        ...,
    )
]
```

Since the dataset prompts themselves are not modified, we repeat the dataset by overriding the corresponding fields. The fields to modify are listed here, with a conceptual sketch of the repetition after the list:

- `num_repeats`: the number of times each problem in the dataset is repeated
- `abbr`: the dataset abbreviation should be changed along with the number of repetitions, because the dataset size changes; this avoids potential mismatches with the values cached in `.cache/dataset_size.json`
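
Conceptually, the repetition amounts to something like the following sketch (illustrative only, assuming each record carries a task id; the actual repetition and per-problem grouping happen inside the dataset class and `MBPPPassKEvaluator`):

```python
from collections import defaultdict

def repeat_dataset(problems, num_repeats=10):
    # Each problem is duplicated num_repeats times, so the model is
    # prompted num_repeats times per problem even without num_return_sequences.
    return [p for p in problems for _ in range(num_repeats)]

def group_by_problem(records):
    # After inference, completions are regrouped per problem so that
    # pass@k can be computed from the n samples of each problem.
    grouped = defaultdict(list)
    for rec in records:
        grouped[rec["task_id"]].append(rec["completion"])
    return grouped
```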

For `mbpp`, modify the `type`, `eval_cfg.evaluator.type`, and `reader_cfg.output_column` fields as well.

We also need randomness in the model's responses, so `generation_kwargs` must be set (without `num_return_sequences` in this case).

For a complete example, see [configs/eval_code_passk_repeat_dataset.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk_repeat_dataset.py).
@@ -0,0 +1,106 @@
# Code Evaluation Tutorial

This tutorial uses `humaneval` and `mbpp` as examples to explain how to evaluate a model's coding ability.

## pass@1

If you only need to generate a single response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), following the general [quick start tutorial](../get_started/quick_start.md).

For multilingual evaluation, refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).

## pass@k

If you need to generate multiple responses per example to evaluate pass@k performance, there are two situations to consider. Here we use 10 responses as an example:

### Typical Situation

The vast majority of models support the `num_return_sequences` argument of HF generation, which we can use directly to obtain multiple responses. Refer to the following configuration file:

```python
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        ...,
        generation_kwargs=dict(
            num_return_sequences=10,
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        ...,
    )
]
```

For `mbpp`, the dataset and evaluation need new changes, so we also modify the `type`, `eval_cfg.evaluator.type`, and `reader_cfg.output_column` fields to accommodate them.

We also need randomness in the model's responses, so `generation_kwargs` must be set; note that `num_return_sequences` controls the number of responses.

Note: `num_return_sequences` must be greater than or equal to k, since pass@k is itself a probability estimate.

For a complete example, see
[configs/eval_code_passk.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk.py)

### Models That Do Not Support Multiple Responses

This applies to some HF models with poorly designed APIs or missing features. In this case, we repeatedly construct the dataset to achieve the effect of multiple responses. Refer to the following configuration file:

```python
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets

humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
mbpp_datasets[0]['num_repeats'] = 10
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'

datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        ...,
        generation_kwargs=dict(
            do_sample=True,
            top_p=0.95,
            temperature=0.8,
        ),
        ...,
    )
]
```

Since the dataset prompts are not modified, we repeat the dataset by overriding the corresponding fields. The following fields need to be modified:

- `num_repeats`: the number of times the dataset is repeated
- `abbr`: the dataset abbreviation should be changed along with the number of repetitions, because the number of dataset entries changes; this prevents potential issues caused by mismatches with the values in `.cache/dataset_size.json`

For `mbpp`, also modify the `type`, `eval_cfg.evaluator.type`, and `reader_cfg.output_column` fields.

We also need randomness in the model's responses, so `generation_kwargs` must be set.

For a complete example, see
[configs/eval_code_passk_repeat_dataset.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk_repeat_dataset.py)