Enable gauntlet training #501

Merged: 79 commits, merged Aug 29, 2023. Changes shown from 77 commits.

Commits:
7e8511b
add subset num batches
bmosaicml Jul 14, 2023
059c43e
add subset num batches
bmosaicml Jul 14, 2023
75c455c
remove tiktoken
bmosaicml Jul 16, 2023
f028ad8
remove openai import
bmosaicml Jul 17, 2023
06fa54a
remove bad line
bmosaicml Jul 17, 2023
3a139b2
foo
bmosaicml Jul 17, 2023
56a2c88
add training callback
bmosaicml Aug 1, 2023
e16e86b
modify yamls
bmosaicml Aug 1, 2023
8341a76
implement train
bmosaicml Aug 1, 2023
6ff5cc5
fix indexing to get most recent eval result
bmosaicml Aug 1, 2023
06560d5
finish
bmosaicml Aug 2, 2023
9e07ece
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 2, 2023
989f61a
finish
bmosaicml Aug 2, 2023
4c316f1
finish
bmosaicml Aug 2, 2023
7de1b8c
finish
bmosaicml Aug 2, 2023
8a77e88
finish
bmosaicml Aug 2, 2023
61d682a
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 9, 2023
6b2116d
foo
bmosaicml Aug 9, 2023
33d3165
foo
bmosaicml Aug 9, 2023
85c2641
working on debugging changeS
bmosaicml Aug 12, 2023
1b3944f
[wip] removing logger dependency from model gauntlet
bmosaicml Aug 14, 2023
309570d
remove logger from eval
bmosaicml Aug 14, 2023
850bc8e
remove logger from eval
bmosaicml Aug 14, 2023
82cee97
remove logger from eval
bmosaicml Aug 14, 2023
fe2c141
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 15, 2023
df170de
debug
bmosaicml Aug 16, 2023
c20ee09
debug
bmosaicml Aug 16, 2023
f23a1ad
debug
bmosaicml Aug 16, 2023
7865e83
debug
bmosaicml Aug 16, 2023
96210f0
fix
bmosaicml Aug 16, 2023
669a770
finish?
bmosaicml Aug 16, 2023
b269552
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 16, 2023
6819b43
fix bug
bmosaicml Aug 16, 2023
03f80d9
merge main
bmosaicml Aug 16, 2023
a4f981a
fix bug
bmosaicml Aug 16, 2023
89d9e6a
minor nits
bmosaicml Aug 16, 2023
3c7b4f5
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 17, 2023
c0b434f
add unit test
bmosaicml Aug 17, 2023
d7662d0
add unit test
bmosaicml Aug 17, 2023
a538154
pyright
bmosaicml Aug 17, 2023
d7c6fdf
Merge branch 'main' into enable_gauntlet_training
codestar12 Aug 18, 2023
43491ca
fixing unit test [wip]
bmosaicml Aug 18, 2023
f8986c6
trying to unit test :(
bmosaicml Aug 21, 2023
4a3475a
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 21, 2023
0df68be
trying to unit test :(
bmosaicml Aug 21, 2023
057204c
Merge branch 'enable_gauntlet_training' of github.com:mosaicml/llm-fo…
bmosaicml Aug 21, 2023
ffa2f45
test not added on github
bmosaicml Aug 21, 2023
5a27871
fix all errors
bmosaicml Aug 21, 2023
427d3af
lint
bmosaicml Aug 22, 2023
0be1ef8
ignore pyright
bmosaicml Aug 22, 2023
d799893
change yaml
bmosaicml Aug 22, 2023
8751480
fix testing
bmosaicml Aug 22, 2023
28bbd08
remove unnecessary line
bmosaicml Aug 22, 2023
3c51e15
fix typing in test
bmosaicml Aug 23, 2023
9c360a1
fix typing
bmosaicml Aug 23, 2023
065a156
fix typing
bmosaicml Aug 23, 2023
247398e
ignore pyright
bmosaicml Aug 23, 2023
c31e405
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 23, 2023
8ee86c4
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 24, 2023
f57364a
done :)
bmosaicml Aug 24, 2023
6827c4f
change name to eval gauntlet
bmosaicml Aug 25, 2023
8140729
merge
bmosaicml Aug 25, 2023
cffbacb
merge main
bmosaicml Aug 25, 2023
c23a2c0
merge main
bmosaicml Aug 25, 2023
e02985f
no mas
bmosaicml Aug 25, 2023
88f8f6f
fix broken test
bmosaicml Aug 25, 2023
3432774
shrink test
bmosaicml Aug 28, 2023
42633ff
try commenting out test_train
bmosaicml Aug 28, 2023
06a10e8
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 28, 2023
44a5010
rm redundant test
bmosaicml Aug 28, 2023
eb8969c
Merge branch 'enable_gauntlet_training' of github.com:mosaicml/llm-fo…
bmosaicml Aug 28, 2023
efebf26
remove broken test
bmosaicml Aug 28, 2023
053b9e0
trying to fix test
bmosaicml Aug 28, 2023
66e63af
done :))))))
bmosaicml Aug 28, 2023
acf3d9e
reduce training to 1ba
bmosaicml Aug 28, 2023
de13800
Merge branch 'main' into enable_gauntlet_training
bmosaicml Aug 28, 2023
19e336e
address daniel's concerns about back compat
bmosaicml Aug 29, 2023
791611f
address daniel's concerns about back compat
bmosaicml Aug 29, 2023
65e6666
fix license
bmosaicml Aug 29, 2023
4 changes: 2 additions & 2 deletions llmfoundry/callbacks/__init__.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0

try:
from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet
from llmfoundry.callbacks.fdiff_callback import FDiffMetrics
from llmfoundry.callbacks.generate_callback import Generate
from llmfoundry.callbacks.model_gauntlet_callback import ModelGauntlet
from llmfoundry.callbacks.monolithic_ckpt_callback import \
MonolithicCheckpointSaver
from llmfoundry.callbacks.resumption_callbacks import (GlobalLRScaling,
@@ -23,5 +23,5 @@
'GlobalLRScaling',
'LayerFreezing',
'ScheduledGarbageCollector',
'ModelGauntlet',
'EvalGauntlet',
]
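
Note for downstream users: `ModelGauntlet` is replaced by `EvalGauntlet` in the public callback API, and the old name now resolves to a deprecation shim (see the callback diff below) that raises an `ImportError`. A minimal migration sketch:

```python
# Before this PR (importing the old name now raises ImportError
# pointing at EvalGauntlet):
# from llmfoundry.callbacks import ModelGauntlet

# After this PR:
from llmfoundry.callbacks import EvalGauntlet
```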
llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -4,14 +4,13 @@
"""Aggregate ICL evals into composite scores."""

import math
import re
from enum import Enum
from typing import Optional

from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ModelGauntlet']
__all__ = ['EvalGauntlet']


class Weighting(Enum):
@@ -21,17 +20,32 @@ class Weighting(Enum):


class ModelGauntlet(Callback):
"""The ModelGauntlet aggregates ICL eval results.
"""The ModelGauntlet callback has been renamed to EvalGauntlet.

We've created this dummy class, in order to alert anyone who may have been
importing ModelGauntlet.
"""

def __init__(
self,
*args, # pyright: ignore [reportMissingParameterType]
**kwargs): # pyright: ignore [reportMissingParameterType]
raise ImportError(
'ModelGauntlet class is deprecated, please use EvalGauntlet')


class EvalGauntlet(Callback):
"""The EvalGauntlet aggregates ICL eval results.

After `eval_end`, this callback inspects the logger for different ICL metrics and aggregates the scores according to the aggregation
specification provided in the constructor.

Args:
logger_keys (dict): These are the exact keys that the individual benchmark metrics will be
logger_keys (list): These are the exact keys that the individual benchmark metrics will be
logged under in the logger after eval
tasks (dict): This contains the list of categories, as well as the subtasks within them, the
random baseline accuracy of each subtask, and the number of fewshot examples
used for the task. See `llmfoundry/scripts/eval/yamls/model_gauntlet.yaml` to see the structure.
used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
weighting (Weighting): The weighting scheme used to balance different tasks within each category.
Either assign them all equal weight, assign them weight proportional
to the dataset size, or assign them weight proportional to the log2 of the dataset size.
@@ -44,12 +58,15 @@ class ModelGauntlet(Callback):
"""

def __init__(self,
logger_keys: dict,
logger_keys: list,
categories: dict,
weighting: str = 'EQUAL',
subtract_random_baseline: bool = True,
rescale_accuracy: bool = True,
benchmark_sizes: Optional[dict] = None):
if isinstance(logger_keys, dict):
raise ValueError(
'logger_keys now requires a list type as input, not a dict')
if weighting != Weighting.EQUAL and benchmark_sizes is None:
raise Exception(
'When not using equal weighting, you must provide the benchmark sizes.'
@@ -90,56 +107,50 @@ def __init__(self,
assert weight is not None
benchmark['weighting'] = weight

def compute_averages(self, logger_data: Logger):

def compute_averages(self, state: State):
results = {}
pat = re.compile(
'metrics/(.*?)/(\d+)-shot(/.*?)?/InContextLearning(.*)' # type: ignore
)

for key in self.logger_keys:
match = pat.match(key)

# TODO(bmosaicml) This needs to be factored for this callback to work as a normal callback
# and therefore for the typing to be fixed
val = logger_data.data[key][0][1].item() # type: ignore

if match:
eval_name = match.group(1)
num_shot = match.group(2)
subcat = match.group(3)
metric = match.group(4)

if subcat is not None:
subcat = subcat[1:]
if f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}' not in results:
results[f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}'] = []
results[
f'metrics/{eval_name}/{num_shot}-shot/InContextLearning{metric}'].append(
val)
else:
results[key] = [val]

# starting at index 1 skips the "metric" part of the key which is superfluous
dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1]
if 'Accuracy' not in metric_name:
continue

metric = state.eval_metrics.get('/'.join(dl_name),
{}).get(metric_name, None)
if metric is None:
continue
val = metric.compute().item()

# ending at index 2 allows us to aggregate over dataloaders w/ subcategories
key = '/'.join(dl_name[0:2])
if key not in results:
results[key] = []

results[key].append(val)

return {k: sum(v) / len(v) for k, v in results.items()}

def eval_end(self, state: State, logger: Logger):
new_metrics = self.compute_averages(logger)
def eval_after_all(self, state: State, logger: Logger):
new_metrics = self.compute_averages(state)
if len(new_metrics) == 0:
return {}
composite_scores = {}

for category in self.categories:
missing_metrics = []
composite_scores[category['name']] = []
for benchmark in category['benchmarks']:
key_pat = re.compile(
f"metrics/{benchmark['name']}/{benchmark['num_fewshot']}-shot/.*Accuracy"
)
key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"

matching_key = [
k for k in new_metrics.keys()
if key_pat.match(k) is not None
]
if len(matching_key) == 0:
if key not in new_metrics:
print(
f"Warning: couldn't find results for benchmark: {benchmark}"
)
missing_metrics.append(key)
else:
score = new_metrics[matching_key[0]]
score = new_metrics[key]

if self.subtract_random_baseline:
score -= benchmark['random_baseline']
@@ -152,19 +163,28 @@ def eval_end(self, state: State, logger: Logger):
'score': score,
'weighting': benchmark['weighting']
})

if len(missing_metrics) > 0:
print(
f"Removing category `{category['name']}` from gauntlet scores because benchmarks were missing: {missing_metrics}"
)
del composite_scores[category['name']]
continue
total_weight = sum(
k['weighting'] for k in composite_scores[category['name']])
composite_scores[category['name']] = sum(
k['score'] * (k['weighting'] / total_weight)
for k in composite_scores[category['name']])

composite_scores = {
f'metrics/model_gauntlet/{k}': v
f'icl/metrics/eval_gauntlet/{k}': v
for k, v in composite_scores.items()
}

composite_scores['metrics/model_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values())
logger.log_metrics(composite_scores)
composite_scores['icl/metrics/eval_gauntlet/average'] = sum(
composite_scores.values()) / len(composite_scores.values()) if len(
composite_scores.values()) > 0 else 0
if logger is not None:
logger.log_metrics(composite_scores)

return composite_scores
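
To make the new metric plumbing concrete, here is a minimal, self-contained sketch of the per-category aggregation that `eval_after_all` performs. All benchmark names and numbers are illustrative, and the accuracy-rescaling step (collapsed in the hunk above) is assumed to divide by `1 - random_baseline`:

```python
# Illustrative recomputation of a per-category composite score, following the
# arithmetic in EvalGauntlet.eval_after_all. Names and values are made up.
category = {
    'name': 'example_category',
    'benchmarks': [
        {'name': 'task_a', 'num_fewshot': 5, 'random_baseline': 0.25, 'weighting': 1},
        {'name': 'task_b', 'num_fewshot': 0, 'random_baseline': 0.0, 'weighting': 1},
    ],
}
# Per-benchmark accuracies as compute_averages would return them,
# keyed by '<benchmark_name>/<num_fewshot>-shot'.
new_metrics = {'task_a/5-shot': 0.62, 'task_b/0-shot': 0.48}

scores = []
for benchmark in category['benchmarks']:
    key = f"{benchmark['name']}/{benchmark['num_fewshot']}-shot"
    score = new_metrics[key]
    score -= benchmark['random_baseline']        # subtract_random_baseline=True
    score /= 1.0 - benchmark['random_baseline']  # rescale_accuracy=True (assumed form)
    scores.append({'score': score, 'weighting': benchmark['weighting']})

# Weighted average over the category's benchmarks.
total_weight = sum(s['weighting'] for s in scores)
composite = sum(s['score'] * (s['weighting'] / total_weight) for s in scores)
print(f"icl/metrics/eval_gauntlet/{category['name']}: {composite:.4f}")
```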
64 changes: 54 additions & 10 deletions llmfoundry/utils/builders.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from composer import algorithms
@@ -12,7 +12,8 @@
from composer.core import Evaluator
from composer.datasets.in_context_learning_evaluation import \
get_icl_task_dataloader
from composer.loggers import MLFlowLogger, TensorboardLogger, WandBLogger
from composer.loggers import (InMemoryLogger, MLFlowLogger, TensorboardLogger,
WandBLogger)
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (ConstantWithWarmupScheduler,
CosineAnnealingWithWarmupScheduler,
@@ -22,13 +23,48 @@
from omegaconf import OmegaConf as om
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from llmfoundry.callbacks import (FDiffMetrics, Generate, GlobalLRScaling,
LayerFreezing, MonolithicCheckpointSaver,
from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, Generate,
GlobalLRScaling, LayerFreezing,
MonolithicCheckpointSaver,
ScheduledGarbageCollector)
from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
DecoupledLionW, DecoupledLionW_8bit)


def build_icl_data_and_gauntlet(
icl_tasks_config: Union[str, ListConfig],
eval_gauntlet_config: Optional[Union[str, DictConfig]],
tokenizer: AutoTokenizer,
device_eval_batch_size: int,
icl_seq_len: int,
icl_subset_num_batches: Optional[int] = None
) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
icl_evaluators, logger_keys = build_icl_evaluators(
icl_tasks_config,
tokenizer,
icl_seq_len,
device_eval_batch_size,
icl_subset_num_batches=icl_subset_num_batches)
eval_gauntlet_cb = None
if eval_gauntlet_config is not None:
if isinstance(eval_gauntlet_config, str):
with open(eval_gauntlet_config, 'r') as icl_f:
eval_gauntlet_cfg = om.load(icl_f)
eval_gauntlet = eval_gauntlet_cfg.eval_gauntlet
elif isinstance(eval_gauntlet_config, DictConfig): # pyright: ignore
eval_gauntlet = eval_gauntlet_config
else:
raise ValueError(
f'Got invalid type for eval_gauntlet_config: {type(eval_gauntlet_config)}'
)
eval_gauntlet.logger_keys = logger_keys
eval_gauntlet.benchmark_sizes = {
e.label: e.dataloader.num_samples for e in icl_evaluators
}
eval_gauntlet_cb = EvalGauntlet(**eval_gauntlet)
return icl_evaluators, logger_keys, eval_gauntlet_cb


def build_callback(name: str, kwargs: Dict[str, Any]):
if name == 'lr_monitor':
return LRMonitor()
@@ -69,6 +105,8 @@ def build_logger(name: str, kwargs: Dict[str, Any]):
return TensorboardLogger(**kwargs)
elif name == 'mlflow':
return MLFlowLogger(**kwargs)
elif name == 'inmemory':
return InMemoryLogger(**kwargs)
else:
raise ValueError(f'Not sure how to build logger: {name}')

@@ -138,13 +176,17 @@ def build_tokenizer(om_tokenizer_config: DictConfig) -> PreTrainedTokenizerBase:
return tokenizer


def build_icl_evaluators(icl_tasks: Union[str, ListConfig],
tokenizer: PreTrainedTokenizerBase,
default_max_seq_len: int,
default_batch_size: int,
destination_dir: Optional[str] = None):
def build_icl_evaluators(
icl_tasks: Union[str, ListConfig],
tokenizer: AutoTokenizer,
default_max_seq_len: int,
default_batch_size: int,
destination_dir: Optional[str] = None,
icl_subset_num_batches: Optional[int] = None,
):
if destination_dir is None:
destination_dir = os.getcwd()

evaluators = []
logger_keys = []

@@ -193,6 +235,7 @@ def _validate_cfg(icl_cfg: DictConfig):
icl_cfg.batch_size = default_batch_size

for icl_cfg in icl_tasks_list:
assert isinstance(icl_cfg, DictConfig)
_validate_cfg(icl_cfg)
for num_fewshot in list(icl_cfg.num_fewshot):
if tokenizer.pad_token_id is None:
@@ -240,6 +283,7 @@ def _validate_cfg(icl_cfg: DictConfig):
evaluators.append(
Evaluator(label=label,
dataloader=dataloaders,
metric_names=metric_names),)
metric_names=metric_names,
subset_num_batches=icl_subset_num_batches))

return evaluators, logger_keys
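
For context on how the new helper is meant to be consumed: `build_icl_data_and_gauntlet` bundles the existing `build_icl_evaluators` call with construction of the `EvalGauntlet` callback and returns all three pieces. A hedged sketch of calling it from a training script; the YAML paths, tokenizer, and batch/sequence sizes below are placeholders, not values from this PR:

```python
from transformers import AutoTokenizer

from llmfoundry.utils.builders import build_icl_data_and_gauntlet

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer

evaluators, logger_keys, eval_gauntlet_cb = build_icl_data_and_gauntlet(
    icl_tasks_config='eval/yamls/tasks.yaml',              # placeholder path
    eval_gauntlet_config='eval/yamls/eval_gauntlet.yaml',  # placeholder path
    tokenizer=tokenizer,
    device_eval_batch_size=4,
    icl_seq_len=1024,
    icl_subset_num_batches=2,  # optional cap on eval batches per ICL dataloader
)

# The returned callback (None if no gauntlet config was given) and evaluators
# would then be handed to the Composer Trainer, e.g.:
# trainer = Trainer(..., eval_dataloader=evaluators,
#                   callbacks=[eval_gauntlet_cb, ...])
```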