Skip to content

Commit

Permalink
[Sync] Initial support of subjective evaluation (#421)
Browse files Browse the repository at this point in the history
Co-authored-by: Leymore <[email protected]>
  • Loading branch information
gaotongxiao and Leymore authored Sep 22, 2023
1 parent 0f2c388 commit a1ea3c0
Show file tree
Hide file tree
Showing 14 changed files with 270 additions and 43 deletions.
17 changes: 17 additions & 0 deletions opencompass/datasets/lmeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import List, Optional

from datasets import Dataset, DatasetDict

from opencompass.datasets import BaseDataset


class LMEvalDataset(BaseDataset):
    """Wrap evaluator inputs (predictions and optional references) as a
    dataset. Intended for OpenCompass's internal use."""

    @staticmethod
    def load(predictions: List, references: Optional[List] = None):
        """Build a dataset dict with a single ``test`` split.

        Args:
            predictions (List): Values stored in the ``prediction`` column.
            references (List, optional): Values stored in the ``reference``
                column. The column is left out when this is ``None`` or
                empty. Defaults to None.

        Returns:
            DatasetDict: A dict whose only split, ``test``, holds the
            columns described above.
        """
        columns = dict(prediction=predictions)
        # A falsy ``references`` (None or an empty list) adds no column.
        if references:
            columns.update(reference=references)
        test_split = Dataset.from_dict(columns)
        return DatasetDict({'test': test_split})
6 changes: 4 additions & 2 deletions opencompass/openicl/icl_dataset_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class DatasetReader:
def __init__(self,
dataset: Union[Dataset, DatasetDict, str],
input_columns: Union[List[str], str],
output_column: str,
output_column: Optional[str],
input_template: Optional[PromptTemplate] = None,
output_template: Optional[PromptTemplate] = None,
train_split: str = 'train',
Expand All @@ -68,7 +68,9 @@ def __init__(self,
self.input_columns = _check_type_list(input_columns, [List, str])
if isinstance(self.input_columns, str):
self.input_columns = self.input_columns.split()
self.output_column = _check_str(output_column)
self.output_column = None
if output_column:
self.output_column = _check_str(output_column)

train_range = _check_type_list(train_range, [None, int, float, str])
test_range = _check_type_list(test_range, [None, int, float, str])
Expand Down
1 change: 1 addition & 0 deletions opencompass/openicl/icl_evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa
94 changes: 94 additions & 0 deletions opencompass/openicl/icl_evaluator/lm_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os.path as osp
from typing import Dict, List, Optional

import mmengine
from mmengine.config import ConfigDict

from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.registry import ICL_PROMPT_TEMPLATES
from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
from opencompass.utils.logging import get_logger
from opencompass.utils.text_postprocessors import first_number_postprocess
from opencompass.utils.types import get_type_from_cfg


class LMEvaluator:
    """Evaluate output with language model.

    Args:
        prompt_template (ConfigDict): Prompt template configuration. Used to
            prompt the language model for scores. User can use two reserved
            keywords, ``{prediction}`` and ``{reference}``, referring to
            the prediction and optionally the reference answer.
        judge_cfg (ConfigDict): The config of language model as a judge.
            Its optional ``max_out_len`` and ``batch_size`` entries are
            forwarded to the inferencer.
        output_path (str): The path to prediction output.
        dataset_cfg (ConfigDict, optional): The config of the dataset to be
            evaluated. Defaults to None.
        postprocessor (ConfigDict): The model prediction's postprocessor
            config. Defaults to ``dict(type=first_number_postprocess)``.
    """

    def __init__(
        self,
        prompt_template: ConfigDict,
        judge_cfg: ConfigDict,
        output_path: str,
        dataset_cfg: Optional[ConfigDict] = None,
        # NOTE: this default dict is shared between calls; it is only read
        # here and must not be mutated.
        postprocessor: ConfigDict = dict(type=first_number_postprocess)
    ) -> None:
        self.output_path = output_path
        # The inferencer takes the output directory and file name
        # separately, so split the path and fall back to the current
        # directory when no directory part is given.
        out_dir, out_name = osp.split(output_path)
        if not out_dir:
            out_dir = './'

        self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template)

        max_out_len = judge_cfg.get('max_out_len', None)
        batch_size = judge_cfg.get('batch_size', None)
        model = build_model_from_cfg(model_cfg=judge_cfg)
        self.inferencer = GenInferencer(model,
                                        max_out_len=max_out_len,
                                        batch_size=batch_size,
                                        output_json_filepath=out_dir,
                                        output_json_filename=out_name)
        self.postprocessor = get_type_from_cfg(postprocessor)
        self.logger = get_logger()
        self.dataset_cfg = dataset_cfg

    def score(self, predictions, references: Optional[List] = None) -> Dict:
        """Run the judge model over the predictions and collect its scores.

        When ``dataset_cfg`` was given, the dataset is rebuilt from it and
        the predictions (plus references, if any) are attached to its
        ``test`` split as extra input columns. Otherwise a minimal
        ``LMEvalDataset`` holding only those columns is created.

        Args:
            predictions: The predictions to be judged, stored in the
                ``prediction`` column.
            references (List, optional): Reference answers, stored in the
                ``reference`` column. Ignored when ``None`` or empty.
                Defaults to None.

        Returns:
            Dict: The content loaded from ``output_path`` after inference,
            where every entry gains a ``score`` field produced by the
            postprocessor. A top-level ``score`` key holding the mean of
            all scores is added when the mean can be computed; otherwise
            it is omitted and a warning is logged.
        """
        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)
            dataset.reader.dataset['test'] = dataset.test.add_column(
                'prediction', predictions)
            dataset.reader.input_columns.append('prediction')
            if references:
                dataset.reader.input_columns.append('reference')
                dataset.reader.dataset['test'] = dataset.test.add_column(
                    'reference', references)
        else:
            # Imported at call time rather than at module import;
            # presumably to avoid a circular import -- TODO confirm.
            from opencompass.datasets.lmeval import LMEvalDataset
            input_columns = ['prediction']
            if references:
                input_columns.append('reference')
            dataset = LMEvalDataset(reader_cfg=dict(
                input_columns=input_columns,
                output_column=None,
                train_split='test'),
                                    predictions=predictions,
                                    references=references)
        retriever = ZeroRetriever(dataset)
        self.inferencer.inference(retriever=retriever,
                                  prompt_template=self.prompt_tmpl)

        # The inferencer was configured in __init__ with the directory and
        # file name split from ``output_path``, so read its results back
        # from that path.
        output = mmengine.load(self.output_path)
        scores = []
        for k, v in output.items():
            score = self.postprocessor(v['prediction'])
            output[k]['score'] = score
            scores.append(score)
        try:
            output['score'] = sum(scores) / len(scores)
        except (TypeError, ZeroDivisionError) as err:
            # Best effort: a non-numeric score (e.g. the postprocessor
            # found nothing to extract) or an empty output leaves the
            # aggregate undefined. Keep the per-sample results and report
            # the problem instead of hiding it.
            self.logger.warning(
                f'Cannot compute the average judge score for '
                f'{self.output_path} ({type(err).__name__}: {err}); '
                f'the overall "score" entry is omitted.')
        return output
39 changes: 35 additions & 4 deletions opencompass/partitioners/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@ class BasePartitioner:
Args:
out_dir (str): The output directory of tasks.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
"""

def __init__(self, out_dir: str):
def __init__(self,
out_dir: str,
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
self.logger = get_logger()
self.out_dir = out_dir
self.keep_keys = keep_keys

def __call__(self, cfg: ConfigDict) -> List[Dict]:
"""Generate tasks from config. Each task is defined as a
Expand Down Expand Up @@ -45,7 +50,26 @@ def __call__(self, cfg: ConfigDict) -> List[Dict]:
datasets = cfg['datasets']
work_dir = cfg['work_dir']

tasks = self.partition(models, datasets, work_dir, self.out_dir)
add_cfg = {}
for k in self.keep_keys:
try:
key_chain = k.split('.')
ori_ptr = cfg
tgt_ptr = add_cfg
for key in key_chain[:-1]:
ori_ptr = ori_ptr[key]
if key not in tgt_ptr:
tgt_ptr[key] = {}
tgt_ptr = tgt_ptr[key]
tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
except AttributeError:
self.logger.warning(f'Key {k} not found in config, ignored.')

tasks = self.partition(models,
datasets,
work_dir,
self.out_dir,
add_cfg=add_cfg)

self.logger.info(f'Partitioned into {len(tasks)} tasks.')
for i, task in enumerate(tasks):
Expand All @@ -54,8 +78,12 @@ def __call__(self, cfg: ConfigDict) -> List[Dict]:
return tasks

@abstractmethod
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[Dict]:
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
Expand All @@ -67,6 +95,7 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
**add_cfg # other keys to be added in the config
}
Args:
Expand All @@ -76,6 +105,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
existence of result file in this directory.
add_cfg (dict): Other common keys to be added in the task config,
used to share the same config among tasks. Defaults to {}.
Returns:
List[Dict]: A list of tasks.
Expand Down
15 changes: 11 additions & 4 deletions opencompass/partitioners/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ class NaivePartitioner(BasePartitioner):
model-dataset pair.
Args:
config (ConfigDict): The full config dict.
out_dir (str): The output directory of tasks.
keep_keys (List[str]): The keys to be kept from the experiment config
to the task config.
"""

def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[Dict]:
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
Expand Down Expand Up @@ -54,7 +60,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
task = Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
return tasks
29 changes: 21 additions & 8 deletions opencompass/partitioners/size.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import math
import os.path as osp
from fnmatch import fnmatch
from typing import List, Tuple, Union
from typing import Dict, List, Tuple, Union

import mmengine
from mmengine.config import Config, ConfigDict
Expand All @@ -25,20 +25,27 @@ class SizePartitioner(BasePartitioner):
gen_task_coef (int): The dataset cost measurement coefficient for
generation tasks.
dataset_size_path (str): The path to the dataset size cache file.
keep_keys (list[str]): The keys to be kept from the experiment config
to the task config.
"""

def __init__(self,
out_dir: str,
max_task_size: int = 40000,
gen_task_coef: int = 20,
dataset_size_path: str = '.cache/dataset_size.json'):
super().__init__(out_dir)
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path

def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[ConfigDict]:
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
Expand All @@ -50,6 +57,7 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
**add_cfg # other keys to be kept in the config
}
Args:
Expand All @@ -59,6 +67,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
existence of result file in this directory.
add_cfg (dict): Other common keys to be added in the task config,
used to share the same config among tasks. Defaults to {}.
Returns:
List[ConfigDict]: A list of tasks.
Expand All @@ -72,7 +82,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
'work_dir': work_dir,
**add_cfg
})
num_data = 0
for dataset in datasets:
Expand All @@ -91,15 +102,17 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
Config({
'models': [model],
'datasets': [[dataset_split]],
'work_dir': work_dir
'work_dir': work_dir,
**add_cfg
}))
else:
if num_data + dataset_size > self.max_task_size:
tasks.append(task)
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
'work_dir': work_dir,
**add_cfg
})
num_data = 0
task['datasets'][0].append(dataset)
Expand Down
11 changes: 4 additions & 7 deletions opencompass/runners/dlc.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
status = [self._launch(task, random_sleep=False) for task in tasks]
return status

def _launch(self, task_cfg: ConfigDict, random_sleep: bool = True):
def _launch(self, cfg: ConfigDict, random_sleep: bool = True):
"""Launch a single task.
Args:
task_cfg (ConfigDict): Task config.
cfg (ConfigDict): Task config.
random_sleep (bool): Whether to sleep for a random time before
running the command. This avoids cluster error when launching
multiple tasks at the same time. Default: True.
Expand All @@ -76,18 +76,15 @@ def _launch(self, task_cfg: ConfigDict, random_sleep: bool = True):
tuple[str, int]: Task name and exit code.
"""

task_type = self.task_cfg.type
if isinstance(self.task_cfg.type, str):
task_type = TASKS.get(task_type)
task = task_type(task_cfg)
task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type']))
num_gpus = task.num_gpus
task_name = task.name

# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py'
try:
task_cfg.dump(param_file)
cfg.dump(param_file)

# Build up DLC command
pwd = os.getcwd()
Expand Down
4 changes: 2 additions & 2 deletions opencompass/runners/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
status = []
if self.debug:
for task in tasks:
task = TASKS.build(dict(type=self.task_cfg.type, cfg=task))
task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
task_name = task.name
# get cmd
mmengine.mkdir_or_exist('tmp/')
Expand Down Expand Up @@ -94,7 +94,7 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
lock = Lock()

def submit(task, index):
task = TASKS.build(dict(type=self.task_cfg.type, cfg=task))
task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
num_gpus = task.num_gpus
assert len(gpus) >= num_gpus

Expand Down
Loading

0 comments on commit a1ea3c0

Please sign in to comment.