From 554bc464022a53024946ae9e7c1aba7b8421a597 Mon Sep 17 00:00:00 2001
From: Max Marion <mmarion538@gmail.com>
Date: Thu, 28 Mar 2024 14:38:33 -0700
Subject: [PATCH] Output eval logging batch (#961)

* Skip flaky lion8b test (#598)

* relax atol and add retries to reduce flakiness in lion8b timing test

* add eval output logging

* add back tasks

* foo

* add rlhf prompts

* add rlhf prompts

* add rlhf prompts

* add rlhf prompts

* add rlhf prompts

* fix prompt

* fix prompt

* modify mcli

* test

* test

* fix

* fix merge

* wip

* merge

* reset files, wip commit

* rm small changes

* reduce changes

* reduce changes

* .

* wip

* rm batch keys

* revert init device

* linting

* add import

* fix import

* add eval_output_logging to registry

* re-add import

* pyright + linting

---------

Co-authored-by: dblalock <davis@mosaicml.com>
Co-authored-by: Jeremy Dohmann <jeremy@mosaicml.com>
---
 llmfoundry/callbacks/__init__.py |  7 ++++---
 scripts/eval/eval.py             | 19 ++++++++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py
index aaba90eeec..d9bb3c24a7 100644
--- a/llmfoundry/callbacks/__init__.py
+++ b/llmfoundry/callbacks/__init__.py
@@ -1,9 +1,9 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from composer.callbacks import (EarlyStopper, Generate, LRMonitor,
-                                MemoryMonitor, MemorySnapshot, OOMObserver,
-                                OptimizerMonitor, RuntimeEstimator,
+from composer.callbacks import (EarlyStopper, EvalOutputLogging, Generate,
+                                LRMonitor, MemoryMonitor, MemorySnapshot,
+                                OOMObserver, OptimizerMonitor, RuntimeEstimator,
                                 SpeedMonitor)
 
 from llmfoundry.callbacks.async_eval_callback import AsyncEval
@@ -33,6 +33,7 @@
 callbacks.register('mono_checkpoint_saver', func=MonolithicCheckpointSaver)
 callbacks.register('scheduled_gc', func=ScheduledGarbageCollector)
 callbacks.register('oom_observer', func=OOMObserver)
+callbacks.register('eval_output_logging', func=EvalOutputLogging)
 
 callbacks_with_config.register('async_eval', func=AsyncEval)
 callbacks_with_config.register('curriculum_learning', func=CurriculumLearning)
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 961b50e254..22108d4c75 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -11,6 +11,7 @@
 
 import pandas as pd
 import torch
+from composer.core import Callback
 from composer.loggers.logger_destination import LoggerDestination
 from composer.trainer import Trainer
 from composer.utils import dist, get_device, reproducibility
@@ -23,8 +24,9 @@
 
 install()
 from llmfoundry.utils.builders import (add_metrics_to_eval_loaders,
-                                       build_composer_model, build_evaluators,
-                                       build_logger, build_tokenizer)
+                                       build_callback, build_composer_model,
+                                       build_evaluators, build_logger,
+                                       build_tokenizer)
 from llmfoundry.utils.config_utils import (log_config, pop_config,
                                            process_init_device)
 from llmfoundry.utils.registry_utils import import_file
@@ -49,6 +51,7 @@ def evaluate_model(
     eval_gauntlet_df: Optional[pd.DataFrame],
     eval_subset_num_batches: int,
     icl_subset_num_batches: Optional[int],
+    callback_configs: Optional[DictConfig],
     metadata: Optional[Dict[str, str]],
     logged_config: DictConfig,
     should_log_config: bool = True,
@@ -73,7 +76,12 @@ def evaluate_model(
         icl_subset_num_batches=icl_subset_num_batches,
     )
 
-    callbacks = []
+    # Callbacks
+    callbacks: List[Callback] = [
+        build_callback(str(name), callback_cfg)
+        for name, callback_cfg in callback_configs.items()
+    ] if callback_configs else []
+
     if eval_gauntlet_callback is not None:
         callbacks.append(eval_gauntlet_callback)
 
@@ -238,6 +246,10 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
 
     # Pop out interpolation variables.
     pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None)
+    callback_configs: Optional[DictConfig] = pop_config(cfg,
+                                                        'callbacks',
+                                                        must_exist=False,
+                                                        default_value=None)
 
     # Warn for unused parameters
     for key in cfg:
@@ -296,6 +308,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
              python_log_level=python_log_level,
              precision=precision,
              eval_gauntlet_df=eval_gauntlet_df,
+             callback_configs=callback_configs,
              eval_subset_num_batches=eval_subset_num_batches,
              icl_subset_num_batches=icl_subset_num_batches,
              metadata=metadata,