Enable gauntlet training #501

Merged: 79 commits, merged Aug 29, 2023. Changes shown from 77 commits.

Commits:
7e8511b
add subset num batches
bmosaicml Jul 14, 2023
059c43e
add subset num batches
bmosaicml Jul 14, 2023
75c455c
remove tiktoken
bmosaicml Jul 16, 2023
f028ad8
remove openai import
bmosaicml Jul 17, 2023
06fa54a
remove bad line
bmosaicml Jul 17, 2023
3a139b2
foo
bmosaicml Jul 17, 2023
56a2c88
add training callback
bmosaicml Aug 1, 2023
e16e86b
modify yamls
bmosaicml Aug 1, 2023
8341a76
implement train
bmosaicml Aug 1, 2023
6ff5cc5
fix indexing to get most recent eval result
bmosaicml Aug 1, 2023
06560d5
finish
bmosaicml Aug 2, 2023
9e07ece
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 2, 2023
989f61a
finish
bmosaicml Aug 2, 2023
4c316f1
finish
bmosaicml Aug 2, 2023
7de1b8c
finish
bmosaicml Aug 2, 2023
8a77e88
finish
bmosaicml Aug 2, 2023
61d682a
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 9, 2023
6b2116d
foo
bmosaicml Aug 9, 2023
33d3165
foo
bmosaicml Aug 9, 2023
85c2641
working on debugging changeS
bmosaicml Aug 12, 2023
1b3944f
[wip] removing logger dependency from model gauntlet
bmosaicml Aug 14, 2023
309570d
remove logger from eval
bmosaicml Aug 14, 2023
850bc8e
remove logger from eval
bmosaicml Aug 14, 2023
82cee97
remove logger from eval
bmosaicml Aug 14, 2023
fe2c141
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 15, 2023
df170de
debug
bmosaicml Aug 16, 2023
c20ee09
debug
bmosaicml Aug 16, 2023
f23a1ad
debug
bmosaicml Aug 16, 2023
7865e83
debug
bmosaicml Aug 16, 2023
96210f0
fix
bmosaicml Aug 16, 2023
669a770
finish?
bmosaicml Aug 16, 2023
b269552
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 16, 2023
6819b43
fix bug
bmosaicml Aug 16, 2023
03f80d9
merge main
bmosaicml Aug 16, 2023
a4f981a
fix bug
bmosaicml Aug 16, 2023
89d9e6a
minor nits
bmosaicml Aug 16, 2023
3c7b4f5
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 17, 2023
c0b434f
add unit test
bmosaicml Aug 17, 2023
d7662d0
add unit test
bmosaicml Aug 17, 2023
a538154
pyright
bmosaicml Aug 17, 2023
d7c6fdf
Merge branch 'main' into enable_gauntlet_training
codestar12 Aug 18, 2023
43491ca
fixing unit test [wip]
bmosaicml Aug 18, 2023
f8986c6
trying to unit test :(
bmosaicml Aug 21, 2023
4a3475a
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 21, 2023
0df68be
trying to unit test :(
bmosaicml Aug 21, 2023
057204c
Merge branch 'enable_gauntlet_training' of github.com:mosaicml/llm-fo…
bmosaicml Aug 21, 2023
ffa2f45
test not added on github
bmosaicml Aug 21, 2023
5a27871
fix all errors
bmosaicml Aug 21, 2023
427d3af
lint
bmosaicml Aug 22, 2023
0be1ef8
ignore pyright
bmosaicml Aug 22, 2023
d799893
change yaml
bmosaicml Aug 22, 2023
8751480
fix testing
bmosaicml Aug 22, 2023
28bbd08
remove unnecessary line
bmosaicml Aug 22, 2023
3c51e15
fix typing in test
bmosaicml Aug 23, 2023
9c360a1
fix typing
bmosaicml Aug 23, 2023
065a156
fix typing
bmosaicml Aug 23, 2023
247398e
ignore pyright
bmosaicml Aug 23, 2023
c31e405
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 23, 2023
8ee86c4
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 24, 2023
f57364a
done :)
bmosaicml Aug 24, 2023
6827c4f
change name to eval gauntlet
bmosaicml Aug 25, 2023
8140729
merge
bmosaicml Aug 25, 2023
cffbacb
merge main
bmosaicml Aug 25, 2023
c23a2c0
merge main
bmosaicml Aug 25, 2023
e02985f
no mas
bmosaicml Aug 25, 2023
88f8f6f
fix broken test
bmosaicml Aug 25, 2023
3432774
shrink test
bmosaicml Aug 28, 2023
42633ff
try commenting out test_train
bmosaicml Aug 28, 2023
06a10e8
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 28, 2023
44a5010
rm redundant test
bmosaicml Aug 28, 2023
eb8969c
Merge branch 'enable_gauntlet_training' of github.com:mosaicml/llm-fo…
bmosaicml Aug 28, 2023
efebf26
remove broken test
bmosaicml Aug 28, 2023
053b9e0
trying to fix test
bmosaicml Aug 28, 2023
66e63af
done :))))))
bmosaicml Aug 28, 2023
acf3d9e
reduce training to 1ba
bmosaicml Aug 28, 2023
de13800
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 28, 2023
19e336e
address daniel's concerns about back compat
bmosaicml Aug 29, 2023
791611f
address daniel's concerns about back compat
bmosaicml Aug 29, 2023
65e6666
fix license
bmosaicml Aug 29, 2023
4 changes: 2 additions & 2 deletions llmfoundry/callbacks/__init__.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0

try:
from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet
from llmfoundry.callbacks.fdiff_callback import FDiffMetrics
from llmfoundry.callbacks.generate_callback import Generate
from llmfoundry.callbacks.model_gauntlet_callback import ModelGauntlet
from llmfoundry.callbacks.monolithic_ckpt_callback import \
MonolithicCheckpointSaver
from llmfoundry.callbacks.resumption_callbacks import (GlobalLRScaling,
@@ -23,5 +23,5 @@
'GlobalLRScaling',
'LayerFreezing',
'ScheduledGarbageCollector',
'ModelGauntlet',
'EvalGauntlet',
]
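
Note for downstream users: `ModelGauntlet` is replaced by `EvalGauntlet` in the public callback API, and the old name now resolves to a deprecation shim (see the callback diff below) that raises an `ImportError`. A minimal migration sketch:

```python
# Before this PR (importing the old name now raises ImportError
# pointing at EvalGauntlet):
# from llmfoundry.callbacks import ModelGauntlet

# After this PR:
from llmfoundry.callbacks import EvalGauntlet
```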
llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -4,14 +4,13 @@
"""Aggregate ICL evals into composite scores."""

import math
import re
from enum import Enum
from typing import Optional

from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ModelGauntlet']
__all__ = ['EvalGauntlet']


class Weighting(Enum):
@@ -21,17 +20,32 @@ class Weighting(Enum):


class ModelGauntlet(Callback):
"""The ModelGauntlet aggregates ICL eval results.
"""The ModelGauntlet callback has been renamed to EvalGauntlet.

We've created this dummy class, in order to alert anyone who may have been
importing ModelGauntlet.
"""

def __init__(
self,
*args, # pyright: ignore [reportMissingParameterType]
**kwargs): # pyright: ignore [reportMissingParameterType]
raise ImportError(
'ModelGauntlet class is deprecated, please use EvalGauntlet')


class EvalGauntlet(Callback):
"""The EvalGauntlet aggregates ICL eval results.

After `eval_end`, this callback inspects the logger for different ICL metrics and aggregates the scores according to the aggregation
specification provided in the constructor.

Args:
logger_keys (dict): These are the exact keys that the individual benchmark metrics will be
logger_keys (list): These are the exact keys that the individual benchmark metrics will be
logged under in the logger after eval
tasks (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/model_gauntlet.yaml` to see the structure.
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
Either assign them all equal weight, assign them weight proportional
to the dataset size, or assign them weight proportional to the log2 of the dataset size.
@@ -44,12 +58,15 @@ class ModelGauntlet(Callback):
"""

def __init__(self,
logger_keys: dict,
logger_keys: list,
categories: dict,
weighting: str = 'EQUAL',
subtract_random_baseline: bool = True,
rescale_accuracy: bool = True,
benchmark_sizes: Optional[dict] = None):
if isinstance(logger_keys, dict):
raise ValueError(
'logger_keys now requires a list type as input, not a dict')
if weighting != Weighting.EQUAL and benchmark_sizes is None:
raise Exception(
'When not using equal weighting, you must provide the benchmark sizes.'
@@ -90,56 +107,50 @@ def __init__(self,
assert weight is not None
benchmark['weighting'] = weight

def compute_averages(self, logger_data: Logger):

def compute_averages(self, state: State):
results = {}
pat = re.compile(
'metrics/(.*?)/(\d+)-shot(/.*?)?/InContextLearning(.*)' # type: ignore
)

for key in self.logger_keys:
match = pat.match(key)

# TODO(bmosaicml) This needs to be factored for this callback to work as a normal callback
# and therefore for the typing to be fixed
val = logger_data.data[key][0][1].item() # type: ignore

if match:
eval_name = match.group(1)
num_shot = match.group(2)
subcat = match.group(3)
metric = match.group(4)

if subcat is not None:
subcat = subcat[1:]
if f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}' not in results:
results[f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}'] = []
results[
f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}'].append(
val)
else:
results[key] = [val]

# starting at index 1 skips the "metric" part of the key which is superfluous
dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1]
if 'Accuracy' not in metric_name:
continue

metric = state.eval_metrics.get('/'.join(dl_name),
{}).get(metric_name, None)
if metric is None:
continue
val = metric.compute().item()

# ending at index 2 allows us to aggregate over dataloaders w/ subcategories
key = '/'.join(dl_name[0:2])
if key not in results:
results[key] = []

results[key].append(val)

return {k: sum(v) / len(v) for k, v in results.items()}

def eval_end(self, state: State, logger: Logger):
new_metrics = self.compute_averages(logger)
def eval_after_all(self, state: State, logger: Logger):
new_metrics = self.compute_averages(state)
if len(new_metrics) == 0:
return {}
composite_scores = {}

for category in self.categories:
missing_metrics = []
composite_scores[category['name']] = []
for benchmark in category['benchmarks']:
key_pat = re.compile(
f"metrics/{benchmark['name']}/{benchmark['num_fewshot']}-shot/.*Accuracy"
)
key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

matching_key = [
k for k in new_metrics.keys()
if key_pat.match(k) is not None
]
if len(matching_key) == 0:
if key not in new_metrics:
print(
f"Warning: couldn't find results for benchmark: {benchmark}"
)
missing_metrics.append(key)
else:
score = new_metrics[matching_key[0]]
score = new_metrics[key]

if self.subtract_random_baseline:
score -= benchmark['random_baseline']
@@ -152,19 +163,28 @@ def eval_end(self, state: State, logger: Logger):
'score': score,
'weighting': benchmark['weighting']
})

if len(missing_metrics) > 0:
print(
f"Removing category `{category['name']}` from gauntlet scores because benchmarks were missing: {missing_metrics}"
)
del composite_scores[category['name']]
continue
total_weight = sum(
k['weighting'] for k in composite_scores[category['name']])
composite_scores[category['name']] = sum(
k['score'] * (k['weighting'] / total_weight)
for k in composite_scores[category['name']])

composite_scores = {
f'metrics/model_gauntlet/{k}': v
f'icl/metrics/eval_gauntlet/{k}': v
for k, v in composite_scores.items()
}

composite_scores['metrics/model_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values())
logger.log_metrics(composite_scores)
composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values()) if len(
composite_scores.values()) > 0 else 0
if logger is not None:
logger.log_metrics(composite_scores)

return composite_scores
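
To make the new metric plumbing concrete, here is a minimal, self-contained sketch of the per-category aggregation that `eval_after_all` performs. All benchmark names and numbers are illustrative, and the accuracy-rescaling step (collapsed in the hunk above) is assumed to divide by `1 - random_baseline`:

```python
# Illustrative recomputation of a per-category composite score, following the
# arithmetic in EvalGauntlet.eval_after_all. Names and values are made up.
category = {
    'name': 'example_category',
    'benchmarks': [
        {'name': 'task_a', 'num_fewshot': 5, 'random_baseline': 0.25, 'weighting': 1},
        {'name': 'task_b', 'num_fewshot': 0, 'random_baseline': 0.0, 'weighting': 1},
    ],
}
# Per-benchmark accuracies as compute_averages would return them,
# keyed by '<benchmark_name>/<num_fewshot>-shot'.
new_metrics = {'task_a/5-shot': 0.62, 'task_b/0-shot': 0.48}

scores = []
for benchmark in category['benchmarks']:
    key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
    score = new_metrics[key]
    score -= benchmark['random_baseline']        # subtract_random_baseline=True
    score /= 1.0 - benchmark['random_baseline']  # rescale_accuracy=True (assumed form)
    scores.append({'score': score, 'weighting': benchmark['weighting']})

# Weighted average over the category's benchmarks.
total_weight = sum(s['weighting'] for s in scores)
composite = sum(s['score'] * (s['weighting'] / total_weight) for s in scores)
print(f"icl/metrics/eval_gauntlet/{category['name']}: {composite:.4f}")
```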
64 changes: 54 additions & 10 deletions llmfoundry/utils/builders.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from composer import algorithms
@@ -12,7 +12,8 @@
from composer.core import Evaluator
from composer.datasets.in_context_learning_evaluation import \
get_icl_task_dataloader
from composer.loggers import MLFlowLogger, TensorboardLogger, WandBLogger
from composer.loggers import (InMemoryLogger, MLFlowLogger, TensorboardLogger,
WandBLogger)
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (ConstantWithWarmupScheduler,
CosineAnnealingWithWarmupScheduler,
@@ -22,13 +23,48 @@
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from llmfoundry.callbacks import (FDiffMetrics, Generate, GlobalLRScaling,
LayerFreezing, MonolithicCheckpointSaver,
from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, Generate,
GlobalLRScaling, LayerFreezing,
MonolithicCheckpointSaver,
ScheduledGarbageCollector)
from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
DecoupledLionW, DecoupledLionW_8bit)


def build_icl_data_and_gauntlet(
icl_tasks_config: Union[str, ListConfig],
eval_gauntlet_config: Optional[Union[str, DictConfig]],
tokenizer: AutoTokenizer,
device_eval_batch_size: int,
icl_seq_len: int,
icl_subset_num_batches: Optional[int] = None
) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
icl_evaluators, logger_keys = build_icl_evaluators(
icl_tasks_config,
tokenizer,
icl_seq_len,
device_eval_batch_size,
icl_subset_num_batches=icl_subset_num_batches)
eval_gauntlet_cb = None
if eval_gauntlet_config is not None:
if isinstance(eval_gauntlet_config, str):
with open(eval_gauntlet_config, 'r') as icl_f:
eval_gauntlet_cfg = om.load(icl_f)
eval_gauntlet = eval_gauntlet_cfg.eval_gauntlet
elif isinstance(eval_gauntlet_config, DictConfig): # pyright: ignore
eval_gauntlet = eval_gauntlet_config
else:
raise ValueError(
f'Got invalid type for eval_gauntlet_config: {type(eval_gauntlet_config)}'
)
eval_gauntlet.logger_keys = logger_keys
eval_gauntlet.benchmark_sizes = {
e.label: e.dataloader.num_samples for e in icl_evaluators
}
eval_gauntlet_cb = EvalGauntlet(**eval_gauntlet)
return icl_evaluators, logger_keys, eval_gauntlet_cb


def build_callback(name: str, kwargs: Dict[str, Any]):
if name == 'lr_monitor':
return LRMonitor()
@@ -69,6 +105,8 @@ def build_logger(name: str, kwargs: Dict[str, Any]):
return TensorboardLogger(**kwargs)
elif name == 'mlflow':
return MLFlowLogger(**kwargs)
elif name == 'inmemory':
return InMemoryLogger(**kwargs)
else:
raise ValueError(f'Not sure how to build logger: {name}')

@@ -138,13 +176,17 @@ def build_tokenizer(om_tokenizer_config: DictConfig) -> PreTrainedTokenizerBase:
return tokenizer


def build_icl_evaluators(icl_tasks: Union[str, ListConfig],
tokenizer: PreTrainedTokenizerBase,
default_max_seq_len: int,
default_batch_size: int,
destination_dir: Optional[str] = None):
def build_icl_evaluators(
icl_tasks: Union[str, ListConfig],
tokenizer: AutoTokenizer,
default_max_seq_len: int,
default_batch_size: int,
destination_dir: Optional[str] = None,
icl_subset_num_batches: Optional[int] = None,
):
if destination_dir is None:
destination_dir = os.getcwd()

evaluators = []
logger_keys = []

@@ -193,6 +235,7 @@ def _validate_cfg(icl_cfg: DictConfig):
icl_cfg.batch_size = default_batch_size

for icl_cfg in icl_tasks_list:
assert isinstance(icl_cfg, DictConfig)
_validate_cfg(icl_cfg)
for num_fewshot in list(icl_cfg.num_fewshot):
if tokenizer.pad_token_id is None:
@@ -240,6 +283,7 @@ def _validate_cfg(icl_cfg: DictConfig):
evaluators.append(
Evaluator(label=label,
dataloader=dataloaders,
metric_names=metric_names),)
metric_names=metric_names,
subset_num_batches=icl_subset_num_batches))

return evaluators, logger_keys
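
For context on how the new helper is meant to be consumed: `build_icl_data_and_gauntlet` bundles the existing `build_icl_evaluators` call with construction of the `EvalGauntlet` callback and returns all three pieces. A hedged sketch of calling it from a training script; the YAML paths, tokenizer, and batch/sequence sizes below are placeholders, not values from this PR:

```python
from transformers import AutoTokenizer

from llmfoundry.utils.builders import build_icl_data_and_gauntlet

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer

evaluators, logger_keys, eval_gauntlet_cb = build_icl_data_and_gauntlet(
    icl_tasks_config='eval/yamls/tasks.yaml',              # placeholder path
    eval_gauntlet_config='eval/yamls/eval_gauntlet.yaml',  # placeholder path
    tokenizer=tokenizer,
    device_eval_batch_size=4,
    icl_seq_len=1024,
    icl_subset_num_batches=2,  # optional cap on eval batches per ICL dataloader
)

# The returned callback (None if no gauntlet config was given) and evaluators
# would then be handed to the Composer Trainer, e.g.:
# trainer = Trainer(..., eval_dataloader=evaluators,
#                   callbacks=[eval_gauntlet_cb, ...])
```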