Merge branch 'main' into milo/foundry-type-cleanup
milocress committed Apr 23, 2024
2 parents 853c173 + 0d62e61 commit 22bbccc
Showing 75 changed files with 449 additions and 146 deletions.
52 changes: 22 additions & 30 deletions llmfoundry/__init__.py
@@ -19,47 +19,39 @@

hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils
from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
Seq2SeqFinetuningCollator,
build_finetuning_dataloader)
from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
flash_attn_fn, scaled_multihead_dot_product_attention)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import MPTMLP
from llmfoundry import (algorithms, callbacks, cli, data, eval, interfaces,
loggers, metrics, models, optim, tokenizers, utils)
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.hf import ComposerHFCausalLM
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel, MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper
from llmfoundry.optim import DecoupledLionW

__all__ = [
'build_finetuning_dataloader',
'Seq2SeqFinetuningCollator',
'MPTBlock',
'MPTMLP',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'InContextLearningDataset',
'InContextLearningMetric',
'ComposerHFCausalLM',
'MPTConfig',
'MPTPreTrainedModel',
'MPTModel',
'MPTForCausalLM',
'ComposerMPTCausalLM',
'ComposerHFCausalLM',
'ComposerHFT5',
'scaled_multihead_dot_product_attention',
'flash_attn_fn',
'MultiheadAttention',
'NoConcatDataset',
'ConcatTokensDataset',
'attn_bias_shape',
'build_attn_bias',
'build_alibi_bias',
'optim',
'utils',
'loggers',
'DecoupledLionW',
'algorithms',
'callbacks',
'TiktokenTokenizerWrapper',
'registry',
'cli',
'data',
'eval',
'interfaces',
'loggers',
'metrics',
'models',
'optim',
'tokenizers',
'utils',
]

__version__ = '0.8.0.dev0'
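For orientation, here is a minimal, hypothetical usage sketch of the reorganized top-level API. Every name is taken from the new imports and __all__ above; the snippet itself is illustrative and not part of the commit.

# Illustrative only: the top level now re-exports subpackages (data, eval,
# models, optim, ...) instead of individual layer/attention symbols.
import llmfoundry
from llmfoundry import data, eval, models, optim, utils
from llmfoundry.models.mpt import ComposerMPTCausalLM, MPTConfig
from llmfoundry.optim import DecoupledLionW

print(llmfoundry.__version__)  # '0.8.0.dev0' at this commit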
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/async_eval_callback.py
@@ -27,6 +27,8 @@

log = logging.getLogger(__name__)

__all__ = ['AsyncEval']

REQUIRED_PARAMS_FOR_EVAL = {
'device_eval_batch_size',
'icl_tasks', # only required for eval, may not be specified in pure training
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/curriculum_learning_callback.py
@@ -20,6 +20,8 @@

log = logging.getLogger(__name__)

__all__ = ['CurriculumLearning']


@experimental_class('CurriculumLearning callback')
class CurriculumLearning(CallbackWithConfig):
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/fdiff_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['FDiffMetrics']


class FDiffMetrics(Callback):
"""Rate of change of metrics.
32 changes: 21 additions & 11 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -14,6 +14,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
@@ -35,6 +36,8 @@

log = logging.getLogger(__name__)

__all__ = ['HuggingFaceCheckpointer']

_LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE)


@@ -158,8 +161,6 @@ def __init__(
if mlflow_logging_config is None:
mlflow_logging_config = {}
if self.mlflow_registered_model_name is not None:
import numpy as np

# Both the metadata and the task are needed in order for mlflow
# and databricks optimized model serving to work
passed_metadata = mlflow_logging_config.get('metadata', {})
@@ -169,18 +170,17 @@
default_input_example = {
'prompt': np.array(['What is Machine Learning?'])
}
is_chat = mlflow_logging_config['task'].endswith(
'chat') or mlflow_logging_config['metadata'].get(
'task', '').endswith('chat')
is_chat = mlflow_logging_config['task'].endswith('chat') or (
mlflow_logging_config['metadata'] is not None and
mlflow_logging_config['metadata'].get('task',
'').endswith('chat'))
if is_chat:
default_input_example = {
'messages':
np.array([{
'role': 'user',
'content': 'What is Machine Learning?'
}])
'messages': [{
'role': 'user',
'content': 'What is Machine Learning?'
}]
}
mlflow_logging_config.setdefault('example_no_conversion', True)
mlflow_logging_config.setdefault('input_example',
default_input_example)

@@ -258,6 +258,16 @@ def _is_last_batch(self, state: State):
return True

assert state.max_duration is not None # for pyright

epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch
second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and (
state.timestamp.epoch == state.max_duration.value - 1)
# If the save interval is specified as exactly the same number of batches as the total duration,
# but the max duration is specified in epochs, we need a special case to identify we are on the last batch
# and should write the mlflow checkpoint. This should occur on the last batch of the final epoch.
if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete:
return True

# If the save interval is specified as 1dur, and the max duration is in epoch units
# we need a special case to identify we are on the last batch and should write the mlflow checkpoint
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
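The new branch above covers the case where the save interval is expressed in batches while the max duration is expressed in epochs. Below is a simplified, hypothetical restatement of that check, with plain values standing in for Composer's State/Timestamp/TimeUnit objects; it is illustrative, not the callback's actual code.

def is_last_batch_special_case(save_interval_unit: str, max_duration_unit: str,
                               max_duration_epochs: int, current_epoch: int,
                               batch_in_epoch: int, batches_per_epoch: int) -> bool:
    # Hypothetical restatement of the check added above; 'ba' and 'ep' mirror
    # TimeUnit.BATCH and TimeUnit.EPOCH.
    epoch_complete = batch_in_epoch == batches_per_epoch
    final_epoch = (max_duration_unit == 'ep' and
                   current_epoch == max_duration_epochs - 1)
    # The last batch of the final epoch should trigger the mlflow checkpoint write.
    return save_interval_unit == 'ba' and final_epoch and epoch_complete

# 2-epoch run, 100 batches per epoch, save interval given in batches:
assert is_last_batch_special_case('ba', 'ep', 2, 1, 100, 100)
assert not is_last_batch_special_case('ba', 'ep', 2, 0, 100, 100)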
4 changes: 3 additions & 1 deletion llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
@@ -9,6 +9,8 @@
from composer.loggers import Logger
from composer.utils import dist

__all__ = ['MegaBlocksMoE_TokPerExpert']


class MegaBlocksMoE_TokPerExpert(Callback):
"""Log tokens per expert for MegaBlocks MoE.
@@ -44,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback):
Args:
log_interval (int, optional): The interval on which to log (Default: 10).
log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log
log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log
only aggregate statistics (Default: False).
all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log
statistics for GPU 0 (Default: False).
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/monolithic_ckpt_callback.py
@@ -15,6 +15,8 @@
from composer.utils import (dist, format_name_with_dist_and_time, parse_uri,
reproducibility)

__all__ = ['MonolithicCheckpointSaver']


class MonolithicCheckpointSaver(Callback):
"""Save a monolithic checkpoint every N batches.
4 changes: 4 additions & 0 deletions llmfoundry/callbacks/resumption_callbacks.py
@@ -7,6 +7,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

from llmfoundry.utils.warnings import experimental_class

__all__ = [
'GlobalLRScaling',
'LayerFreezing',
@@ -15,6 +17,7 @@
log = logging.getLogger(__name__)


@experimental_class('GlobalLRScaling')
class GlobalLRScaling(Callback):
"""GlobalLRScaling.
@@ -52,6 +55,7 @@ def fit_start(self, state: State, logger: Logger) -> None:
]


@experimental_class('LayerFreezing')
class LayerFreezing(Callback):
"""LayerFreezing.
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/scheduled_gc_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ScheduledGarbageCollector']


def gc_cuda():
"""Garbage collect Torch (CUDA) memory."""
15 changes: 13 additions & 2 deletions llmfoundry/data/__init__.py
@@ -4,9 +4,14 @@
from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
StreamingFinetuningDataset,
build_finetuning_dataloader)
from llmfoundry.data.text_data import (StreamingTextDataset,
build_text_dataloader)
from llmfoundry.data.packing import (BinPackCollator, auto_packing_ratio,
profile_packing)
from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
StreamingTextDataset,
build_text_dataloader,
get_tokens_per_batch_func)
from llmfoundry.registry import dataloaders

dataloaders.register('text', func=build_text_dataloader)
@@ -15,9 +20,15 @@
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'build_text_dataloader',
'NoConcatDataset',
'ConcatTokensDataset',
'build_dataloader',
'BinPackCollator',
'auto_packing_ratio',
'profile_packing',
'ConcatenatedSequenceCollatorWrapper',
'get_tokens_per_batch_func',
]
5 changes: 5 additions & 0 deletions llmfoundry/data/data.py
@@ -11,6 +11,11 @@
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase

__all__ = [
'ConcatTokensDataset',
'NoConcatDataset',
]


class NoConcatDataset(IterableDataset):
"""An IterableDataset that returns text samples for MDSWriter.
4 changes: 4 additions & 0 deletions llmfoundry/data/dataloader.py
@@ -11,6 +11,10 @@
from llmfoundry import registry
from llmfoundry.utils.registry_utils import construct_from_registry

__all__ = [
'build_dataloader',
]


def build_dataloader(cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase,
device_batch_size: int) -> DataSpec:
13 changes: 12 additions & 1 deletion llmfoundry/data/finetuning/__init__.py
@@ -3,5 +3,16 @@

from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.data.finetuning.tasks import (StreamingFinetuningDataset,
dataset_constructor,
is_valid_ift_example,
tokenize_formatted_example)

__all__ = ['Seq2SeqFinetuningCollator', 'build_finetuning_dataloader']
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'dataset_constructor',
'tokenize_formatted_example',
'is_valid_ift_example',
'StreamingFinetuningDataset',
]
4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/collator.py
@@ -10,6 +10,10 @@

log = logging.getLogger(__name__)

__all__ = [
'Seq2SeqFinetuningCollator',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -24,6 +24,10 @@

log = logging.getLogger(__name__)

__all__ = [
'build_finetuning_dataloader',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

7 changes: 6 additions & 1 deletion llmfoundry/data/finetuning/tasks.py
@@ -72,7 +72,12 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:

log = logging.getLogger(__name__)

__all__ = ['dataset_constructor']
__all__ = [
'dataset_constructor',
'tokenize_formatted_example',
'is_valid_ift_example',
'StreamingFinetuningDataset',
]

_ALLOWED_RESPONSE_KEYS = {'response', 'completion'}
_ALLOWED_PROMPT_KEYS = {'prompt'}
6 changes: 6 additions & 0 deletions llmfoundry/data/packing.py
@@ -12,6 +12,12 @@

log = logging.getLogger(__name__)

__all__ = [
'BinPackCollator',
'auto_packing_ratio',
'profile_packing',
]


class BinPackCollator:
"""Utility collator for packing to reduce padding."""
7 changes: 7 additions & 0 deletions llmfoundry/data/text_data.py
@@ -22,6 +22,13 @@

log = logging.getLogger(__name__)

__all__ = [
'StreamingTextDataset',
'build_text_dataloader',
'ConcatenatedSequenceCollatorWrapper',
'get_tokens_per_batch_func',
]


class StreamingTextDataset(StreamingDataset):
"""Generic text dataset using MosaicML's StreamingDataset.
29 changes: 29 additions & 0 deletions llmfoundry/eval/__init__.py
@@ -1,2 +1,31 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from llmfoundry.eval.datasets.in_context_learning_evaluation import (
InContextLearningCodeEvalDataset, InContextLearningDataset,
InContextLearningGenerationTaskWithAnswersDataset,
InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
InContextLearningSchemaTaskDataset, get_icl_task_dataloader)
from llmfoundry.eval.metrics.nlp import (
InContextLearningCodeEvalAccuracy,
InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
InContextLearningMultipleChoiceAccuracy)

__all__ = [
'InContextLearningDataset',
'InContextLearningLMTaskDataset',
'InContextLearningMultipleChoiceTaskDataset',
'InContextLearningSchemaTaskDataset',
'InContextLearningCodeEvalDataset',
'InContextLearningGenerationTaskWithAnswersDataset',
'get_icl_task_dataloader',
'InContextLearningMetric',
'InContextLearningLMAccuracy',
'InContextLearningMultipleChoiceAccuracy',
'InContextLearningGenerationExactMatchAccuracy',
'InContextLearningCodeEvalAccuracy',
'InContextLearningLMExpectedCalibrationError',
'InContextLearningMCExpectedCalibrationError',
]
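A small, hypothetical smoke check for the new llmfoundry.eval package; the names come from the __all__ list above, and the snippet is not part of the commit.

import llmfoundry.eval as foundry_eval

# Hypothetical check: every exported name should resolve on the module.
for name in foundry_eval.__all__:
    assert hasattr(foundry_eval, name), f'missing export: {name}'
print(f'{len(foundry_eval.__all__)} eval exports resolved')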
