[Sync] Initial support of subjective evaluation (#421)

Co-authored-by: Leymore <[email protected]>
open-compass · Sep 22, 2023 · a1ea3c0 · a1ea3c0
1 parent 0f2c388
commit a1ea3c0
Show file tree

Hide file tree

Showing 14 changed files with 270 additions and 43 deletions.
diff --git a/opencompass/datasets/lmeval.py b/opencompass/datasets/lmeval.py
@@ -0,0 +1,17 @@
+from typing import List, Optional
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.datasets import BaseDataset
+
+
+class LMEvalDataset(BaseDataset):
+    """Expose evaluator inputs (predictions and, optionally, references) as a
+    dataset. Designed for OpenCompass's internal use."""
+
+    @staticmethod
+    def load(predictions: List, references: Optional[List] = None):
+        """Pack the evaluator inputs into a ``DatasetDict``.
+
+        Args:
+            predictions (List): Values stored in the ``prediction`` column.
+            references (List, optional): Values stored in the ``reference``
+                column. The column is left out when this is ``None`` or
+                empty.
+
+        Returns:
+            DatasetDict: A dataset dict whose only split is ``test``.
+        """
+        columns = dict(prediction=predictions)
+        if references:
+            columns.update(reference=references)
+        test_split = Dataset.from_dict(columns)
+        return DatasetDict({'test': test_split})
diff --git a/opencompass/openicl/icl_dataset_reader.py b/opencompass/openicl/icl_dataset_reader.py
@@ -58,7 +58,7 @@ class DatasetReader:
     def __init__(self,
                  dataset: Union[Dataset, DatasetDict, str],
                  input_columns: Union[List[str], str],
-                 output_column: str,
+                 output_column: Optional[str],
                  input_template: Optional[PromptTemplate] = None,
                  output_template: Optional[PromptTemplate] = None,
                  train_split: str = 'train',
@@ -68,7 +68,9 @@ def __init__(self,
         self.input_columns = _check_type_list(input_columns, [List, str])
         if isinstance(self.input_columns, str):
             self.input_columns = self.input_columns.split()
-        self.output_column = _check_str(output_column)
+        self.output_column = None
+        if output_column:
+            self.output_column = _check_str(output_column)
 
         train_range = _check_type_list(train_range, [None, int, float, str])
         test_range = _check_type_list(test_range, [None, int, float, str])

diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py
@@ -4,3 +4,4 @@
 from .icl_em_evaluator import EMEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_toxic_evaluator import ToxicEvaluator  # noqa
+from .lm_evaluator import LMEvaluator  # noqa
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -0,0 +1,94 @@
+import os.path as osp
+from typing import Dict, List, Optional
+
+import mmengine
+from mmengine.config import ConfigDict
+
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.registry import ICL_PROMPT_TEMPLATES
+from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
+from opencompass.utils.logging import get_logger
+from opencompass.utils.text_postprocessors import first_number_postprocess
+from opencompass.utils.types import get_type_from_cfg
+
+
+class LMEvaluator:
+    """Evaluate output with language model.
+
+    Args:
+        prompt_template (ConfigDict): Prompt template configuration. Used to
+            prompt the language model for scores. User can use two reserved
+            keywords, ``{prediction}`` and ``{reference}``, referring to
+            the prediction and optionally the reference answer.
+        judge_cfg (ConfigDict): The config of language model as a judge.
+            ``max_out_len`` and ``batch_size`` are read from it (``None``
+            when absent) and handed to the inferencer.
+        output_path (str): The path to prediction output.
+        dataset_cfg (ConfigDict, optional): The config of the dataset to be
+            evaluated.
+        postprocessor (ConfigDict, optional): The model prediction's
+            postprocessor config. Defaults to
+            ``dict(type=first_number_postprocess)`` when omitted or ``None``.
+    """
+
+    def __init__(
+        self,
+        prompt_template: ConfigDict,
+        judge_cfg: ConfigDict,
+        output_path: str,
+        dataset_cfg: Optional[ConfigDict] = None,
+        postprocessor: Optional[ConfigDict] = None,
+    ) -> None:
+        # Build the default config per call so that no dict object is shared
+        # between instances through the function's default value.
+        if postprocessor is None:
+            postprocessor = dict(type=first_number_postprocess)
+
+        self.output_path = output_path
+        # The inferencer takes the directory and the file name separately.
+        out_dir, out_name = osp.split(output_path)
+        if not out_dir:
+            out_dir = './'
+
+        self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template)
+
+        max_out_len = judge_cfg.get('max_out_len', None)
+        batch_size = judge_cfg.get('batch_size', None)
+        model = build_model_from_cfg(model_cfg=judge_cfg)
+        self.inferencer = GenInferencer(model,
+                                        max_out_len=max_out_len,
+                                        batch_size=batch_size,
+                                        output_json_filepath=out_dir,
+                                        output_json_filename=out_name)
+        self.postprocessor = get_type_from_cfg(postprocessor)
+        self.logger = get_logger()
+        self.dataset_cfg = dataset_cfg
+
+    def score(self, predictions, references: Optional[List] = None) -> Dict:
+        """Run the judge model over the predictions and collect its scores.
+
+        Args:
+            predictions: The predictions to be judged, exposed to the prompt
+                template as the ``prediction`` column.
+            references (List, optional): Reference answers, exposed as the
+                ``reference`` column when given and non-empty.
+
+        Returns:
+            Dict: The content loaded from ``output_path`` after inference.
+            Every record gains a ``score`` entry holding the postprocessed
+            judge output, and a top-level ``score`` entry holds the mean of
+            those scores when it can be computed.
+        """
+        if self.dataset_cfg:
+            # Attach the predictions (and references) to the configured
+            # dataset as extra input columns of its test split.
+            dataset = build_dataset_from_cfg(self.dataset_cfg)
+            dataset.reader.dataset['test'] = dataset.test.add_column(
+                'prediction', predictions)
+            dataset.reader.input_columns.append('prediction')
+            if references:
+                dataset.reader.input_columns.append('reference')
+                dataset.reader.dataset['test'] = dataset.test.add_column(
+                    'reference', references)
+        else:
+            # No dataset configured: wrap the raw inputs in a minimal dataset.
+            from opencompass.datasets.lmeval import LMEvalDataset
+            input_columns = ['prediction']
+            if references:
+                input_columns.append('reference')
+            reader_cfg = dict(input_columns=input_columns,
+                              output_column=None,
+                              train_split='test')
+            dataset = LMEvalDataset(reader_cfg=reader_cfg,
+                                    predictions=predictions,
+                                    references=references)
+        retriever = ZeroRetriever(dataset)
+        self.inferencer.inference(retriever=retriever,
+                                  prompt_template=self.prompt_tmpl)
+
+        # The inferencer wrote its results to ``output_path``; read them back.
+        output = mmengine.load(self.output_path)
+        scores = []
+        for record in output.values():
+            score = self.postprocessor(record['prediction'])
+            record['score'] = score
+            scores.append(score)
+        # The mean is best-effort: it cannot be computed when there are no
+        # records or when some postprocessed score is not a number. Report
+        # that instead of silently dropping the overall score.
+        try:
+            average = sum(scores) / len(scores)
+        except (TypeError, ArithmeticError) as e:
+            self.logger.warning(
+                f'Failed to compute the average score, skipped: {e!r}')
+        else:
+            output['score'] = average
+        return output
diff --git a/opencompass/partitioners/base.py b/opencompass/partitioners/base.py
@@ -13,11 +13,16 @@ class BasePartitioner:
 
     Args:
         out_dir (str): The output directory of tasks.
+        keep_keys (List[str]): The keys to be kept from the experiment config
+            to the task config.
     """
 
-    def __init__(self, out_dir: str):
+    def __init__(self,
+                 out_dir: str,
+                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
         self.logger = get_logger()
         self.out_dir = out_dir
+        self.keep_keys = keep_keys
 
     def __call__(self, cfg: ConfigDict) -> List[Dict]:
         """Generate tasks from config. Each task is defined as a
@@ -45,7 +50,26 @@ def __call__(self, cfg: ConfigDict) -> List[Dict]:
         datasets = cfg['datasets']
         work_dir = cfg['work_dir']
 
-        tasks = self.partition(models, datasets, work_dir, self.out_dir)
+        add_cfg = {}
+        for k in self.keep_keys:
+            try:
+                key_chain = k.split('.')
+                ori_ptr = cfg
+                tgt_ptr = add_cfg
+                for key in key_chain[:-1]:
+                    ori_ptr = ori_ptr[key]
+                    if key not in tgt_ptr:
+                        tgt_ptr[key] = {}
+                    tgt_ptr = tgt_ptr[key]
+                tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
+            except AttributeError:
+                self.logger.warning(f'Key {k} not found in config, ignored.')
+
+        tasks = self.partition(models,
+                               datasets,
+                               work_dir,
+                               self.out_dir,
+                               add_cfg=add_cfg)
 
         self.logger.info(f'Partitioned into {len(tasks)} tasks.')
         for i, task in enumerate(tasks):
@@ -54,8 +78,12 @@ def __call__(self, cfg: ConfigDict) -> List[Dict]:
         return tasks
 
     @abstractmethod
-    def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
-                  work_dir: str, out_dir: str) -> List[Dict]:
+    def partition(self,
+                  models: List[ConfigDict],
+                  datasets: List[ConfigDict],
+                  work_dir: str,
+                  out_dir: str,
+                  add_cfg: Dict = {}) -> List[Dict]:
         """Partition model-dataset pairs into tasks. Each task is defined as a
         dict and will run independently as a unit. Its structure is as
         follows:
@@ -67,6 +95,7 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
                 'datasets': [[]],  # a nested list of dataset configs, each
                                     list corresponds to a model
                 'work_dir': '',  # the work dir
+                **add_cfg  # other keys to be added in the config
             }
 
         Args:
@@ -76,6 +105,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
             out_dir (str): The full output path for the task, intended for
                 Partitioners to check whether the task is finished via the
                 existence of result file in this directory.
+            add_cfg (dict): Other common keys to be added in the task config,
+                used to share the same config among tasks. Defaults to {}.
 
         Returns:
             List[Dict]: A list of tasks.

diff --git a/opencompass/partitioners/naive.py b/opencompass/partitioners/naive.py
@@ -15,11 +15,17 @@ class NaivePartitioner(BasePartitioner):
     model-dataset pair.
 
     Args:
-        config (ConfigDict): The full config dict.
+        out_dir (str): The output directory of tasks.
+        keep_keys (List[str]): The keys to be kept from the experiment config
+            to the task config.
     """
 
-    def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
-                  work_dir: str, out_dir: str) -> List[Dict]:
+    def partition(self,
+                  models: List[ConfigDict],
+                  datasets: List[ConfigDict],
+                  work_dir: str,
+                  out_dir: str,
+                  add_cfg: Dict = {}) -> List[Dict]:
         """Partition model-dataset pairs into tasks. Each task is defined as a
         dict and will run independently as a unit. Its structure is as
         follows:
@@ -54,7 +60,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
                 task = Config({
                     'models': [model],
                     'datasets': [[dataset]],
-                    'work_dir': work_dir
+                    'work_dir': work_dir,
+                    **add_cfg
                 })
                 tasks.append(task)
         return tasks
diff --git a/opencompass/partitioners/size.py b/opencompass/partitioners/size.py
@@ -2,7 +2,7 @@
 import math
 import os.path as osp
 from fnmatch import fnmatch
-from typing import List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import mmengine
 from mmengine.config import Config, ConfigDict
@@ -25,20 +25,27 @@ class SizePartitioner(BasePartitioner):
         gen_task_coef (int): The dataset cost measurement coefficient for
             generation tasks.
         dataset_size_path (str): The path to the dataset size cache file.
+        keep_keys (list[str]): The keys to be kept from the experiment config
+            to the task config.
     """
 
     def __init__(self,
                  out_dir: str,
                  max_task_size: int = 40000,
                  gen_task_coef: int = 20,
-                 dataset_size_path: str = '.cache/dataset_size.json'):
-        super().__init__(out_dir)
+                 dataset_size_path: str = '.cache/dataset_size.json',
+                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
         self.max_task_size = max_task_size
         self.gen_task_coef = gen_task_coef
         self.dataset_size_path = dataset_size_path
 
-    def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
-                  work_dir: str, out_dir: str) -> List[ConfigDict]:
+    def partition(self,
+                  models: List[ConfigDict],
+                  datasets: List[ConfigDict],
+                  work_dir: str,
+                  out_dir: str,
+                  add_cfg: Dict = {}) -> List[ConfigDict]:
         """Partition model-dataset pairs into tasks. Each task is defined as a
         dict and will run independently as a unit. Its structure is as
         follows:
@@ -50,6 +57,7 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
                 'datasets': [[]],  # a nested list of dataset configs, each
                                     list corresponds to a model
                 'work_dir': '',  # the work dir
+                **add_cfg  # other keys to be kept in the config
             }
 
         Args:
@@ -59,6 +67,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
             out_dir (str): The full output path for the task, intended for
                 Partitioners to check whether the task is finished via the
                 existence of result file in this directory.
+            add_cfg (dict): Other common keys to be added in the task config,
+                used to share the same config among tasks. Defaults to {}.
 
         Returns:
             List[ConfigDict]: A list of tasks.
@@ -72,7 +82,8 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
             task = Config({
                 'models': [model],
                 'datasets': [[]],
-                'work_dir': work_dir
+                'work_dir': work_dir,
+                **add_cfg
             })
             num_data = 0
             for dataset in datasets:
@@ -91,15 +102,17 @@ def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
                                 Config({
                                     'models': [model],
                                     'datasets': [[dataset_split]],
-                                    'work_dir': work_dir
+                                    'work_dir': work_dir,
+                                    **add_cfg
                                 }))
                 else:
                     if num_data + dataset_size > self.max_task_size:
                         tasks.append(task)
                         task = Config({
                             'models': [model],
                             'datasets': [[]],
-                            'work_dir': work_dir
+                            'work_dir': work_dir,
+                            **add_cfg
                         })
                         num_data = 0
                     task['datasets'][0].append(dataset)

diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
@@ -63,11 +63,11 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
             status = [self._launch(task, random_sleep=False) for task in tasks]
         return status
 
-    def _launch(self, task_cfg: ConfigDict, random_sleep: bool = True):
+    def _launch(self, cfg: ConfigDict, random_sleep: bool = True):
         """Launch a single task.
 
         Args:
-            task_cfg (ConfigDict): Task config.
+            cfg (ConfigDict): Task config.
             random_sleep (bool): Whether to sleep for a random time before
                 running the command. This avoids cluster error when launching
                 multiple tasks at the same time. Default: True.
@@ -76,18 +76,15 @@ def _launch(self, task_cfg: ConfigDict, random_sleep: bool = True):
             tuple[str, int]: Task name and exit code.
         """
 
-        task_type = self.task_cfg.type
-        if isinstance(self.task_cfg.type, str):
-            task_type = TASKS.get(task_type)
-        task = task_type(task_cfg)
+        task = TASKS.build(dict(cfg=cfg, type=self.task_cfg['type']))
         num_gpus = task.num_gpus
         task_name = task.name
 
         # Dump task config to file
         mmengine.mkdir_or_exist('tmp/')
         param_file = f'tmp/{os.getpid()}_params.py'
         try:
-            task_cfg.dump(param_file)
+            cfg.dump(param_file)
 
             # Build up DLC command
             pwd = os.getcwd()

diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py
@@ -57,7 +57,7 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
         status = []
         if self.debug:
             for task in tasks:
-                task = TASKS.build(dict(type=self.task_cfg.type, cfg=task))
+                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                 task_name = task.name
                 # get cmd
                 mmengine.mkdir_or_exist('tmp/')
@@ -94,7 +94,7 @@ def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
             lock = Lock()
 
             def submit(task, index):
-                task = TASKS.build(dict(type=self.task_cfg.type, cfg=task))
+                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                 num_gpus = task.num_gpus
                 assert len(gpus) >= num_gpus