Merge branch 'main' into wip
dakinggg committed Apr 23, 2024
2 parents 5209049 + c53622e commit e432090
Showing 111 changed files with 7,273 additions and 488 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
my-copy-c4*/
my-copy-arxiv*/
*.jsonl*
!tests/eval/local_data/*.jsonl

# WandB
wandb/
56 changes: 23 additions & 33 deletions llmfoundry/__init__.py
@@ -19,49 +19,39 @@

hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils
from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
Seq2SeqFinetuningCollator,
build_finetuning_dataloader)
from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
flash_attn_fn, scaled_multihead_dot_product_attention)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
from llmfoundry import (algorithms, callbacks, cli, data, eval, interfaces,
loggers, metrics, models, optim, tokenizers, utils)
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.hf import ComposerHFCausalLM
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel, MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper
from llmfoundry.optim import DecoupledLionW

__all__ = [
'build_finetuning_dataloader',
'Seq2SeqFinetuningCollator',
'MPTBlock',
'FFN_CLASS_REGISTRY',
'MPTMLP',
'build_ffn',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'InContextLearningDataset',
'InContextLearningMetric',
'ComposerHFCausalLM',
'MPTConfig',
'MPTPreTrainedModel',
'MPTModel',
'MPTForCausalLM',
'ComposerMPTCausalLM',
'ComposerHFCausalLM',
'ComposerHFT5',
'scaled_multihead_dot_product_attention',
'flash_attn_fn',
'MultiheadAttention',
'NoConcatDataset',
'ConcatTokensDataset',
'attn_bias_shape',
'build_attn_bias',
'build_alibi_bias',
'optim',
'utils',
'loggers',
'DecoupledLionW',
'algorithms',
'callbacks',
'TiktokenTokenizerWrapper',
'registry',
'cli',
'data',
'eval',
'interfaces',
'loggers',
'metrics',
'models',
'optim',
'tokenizers',
'utils',
]

__version__ = '0.7.0'
__version__ = '0.8.0.dev0'
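
For readers tracking the import reorganization above, here is a minimal smoke test of the 0.8.0.dev0 top-level surface. It is an illustrative sketch, not part of the commit: the import paths are copied from the new import block, and it assumes each symbol still resolves in an environment with this version installed.

# Sketch of the reorganized public API after this commit; paths are taken from
# the import block above, and their availability at runtime is assumed.
from llmfoundry import data, eval, metrics, models, optim, tokenizers, utils
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.mpt import ComposerMPTCausalLM, MPTConfig, MPTForCausalLM
from llmfoundry.optim import DecoupledLionW

# Low-level attention helpers are no longer re-exported at the top level; they
# are assumed to remain importable from their defining module.
from llmfoundry.models.layers.attention import scaled_multihead_dot_product_attention
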
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/async_eval_callback.py
@@ -27,6 +27,8 @@

log = logging.getLogger(__name__)

__all__ = ['AsyncEval']

REQUIRED_PARAMS_FOR_EVAL = {
'device_eval_batch_size',
'icl_tasks', # only required for eval, may not be specified in pure training
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/curriculum_learning_callback.py
@@ -20,6 +20,8 @@

log = logging.getLogger(__name__)

__all__ = ['CurriculumLearning']


@experimental_class('CurriculumLearning callback')
class CurriculumLearning(CallbackWithConfig):
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/fdiff_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['FDiffMetrics']


class FDiffMetrics(Callback):
"""Rate of change of metrics.
55 changes: 43 additions & 12 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -12,8 +12,9 @@
import time
from multiprocessing.context import SpawnProcess
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
@@ -35,6 +36,8 @@

log = logging.getLogger(__name__)

__all__ = ['HuggingFaceCheckpointer']

_LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE)


@@ -158,8 +161,6 @@ def __init__(
if mlflow_logging_config is None:
mlflow_logging_config = {}
if self.mlflow_registered_model_name is not None:
import numpy as np

# Both the metadata and the task are needed in order for mlflow
# and databricks optimized model serving to work
passed_metadata = mlflow_logging_config.get('metadata', {})
@@ -169,18 +170,17 @@
default_input_example = {
'prompt': np.array(['What is Machine Learning?'])
}
is_chat = mlflow_logging_config['task'].endswith(
'chat') or mlflow_logging_config['metadata'].get(
'task', '').endswith('chat')
is_chat = mlflow_logging_config['task'].endswith('chat') or (
mlflow_logging_config['metadata'] is not None and
mlflow_logging_config['metadata'].get('task',
'').endswith('chat'))
if is_chat:
default_input_example = {
'messages':
np.array([{
'role': 'user',
'content': 'What is Machine Learning?'
}])
'messages': [{
'role': 'user',
'content': 'What is Machine Learning?'
}]
}
mlflow_logging_config.setdefault('example_no_conversion', True)
mlflow_logging_config.setdefault('input_example',
default_input_example)
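
The branch above decides which default input_example the checkpointer attaches for MLflow logging. A hypothetical constructor argument that would take the chat path is sketched below; the task strings are assumptions based only on the .endswith('chat') check, not values taken from this diff.

# Hypothetical mlflow_logging_config passed to HuggingFaceCheckpointer; either
# the top-level task or metadata['task'] ending in 'chat' selects the
# messages-style default input_example shown above.
mlflow_logging_config = {
    'task': 'llm/v1/chat',               # assumed task string
    'metadata': {'task': 'llm/v1/chat'},
}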

@@ -258,6 +258,16 @@ def _is_last_batch(self, state: State):
return True

assert state.max_duration is not None # for pyright

epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch
second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and (
state.timestamp.epoch == state.max_duration.value - 1)
# If the save interval is specified as exactly the same number of batches as the total duration,
# but the max duration is specified in epochs, we need a special case to identify we are on the last batch
# and should write the mlflow checkpoint. This should occur on the last batch of the final epoch.
if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete:
return True
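
A worked example of the new condition, with hypothetical numbers rather than values from this diff: a 2-epoch run with 100 batches per epoch and a save_interval of 200 batches. Time and TimeUnit are the composer.core classes already imported in this file.

# Illustrative only: mirrors the branch above with made-up values.
from composer.core import Time, TimeUnit

save_interval = Time(200, TimeUnit.BATCH)  # save interval given in batches
max_duration = Time(2, TimeUnit.EPOCH)     # max duration given in epochs
dataloader_len = 100                       # batches per epoch
epoch, batch_in_epoch = 1, 100             # 0-indexed epoch counter, final batch

epoch_complete = dataloader_len == batch_in_epoch
second_to_last_epoch = (max_duration.unit == TimeUnit.EPOCH and
                        epoch == max_duration.value - 1)
# All three conditions hold, so _is_last_batch returns True and the MLflow
# checkpoint is written on this batch.
assert save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete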

# If the save interval is specified as 1dur, and the max duration is in epoch units
# we need a special case to identify we are on the last batch and should write the mlflow checkpoint
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
@@ -273,6 +283,23 @@ def _all_child_processes_done(self) -> bool:
dist.all_reduce(x, reduce_operation='MAX')
return x.item() == 0

def transform_model_and_tokenizer(
self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
"""Transform the model and tokenizer before saving.
This allows a subclass to modify the model and tokenizer before saving. The base class implementation will
make no modifications.
Args:
model (PreTrainedModel): The model to be transformed.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be transformed.
Returns:
Tuple[PreTrainedModel, PreTrainedTokenizerBase]: The transformed model and tokenizer.
"""
return model, tokenizer
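
The new hook gives subclasses one place to adjust the artifacts before they are written to disk and registered. A minimal sketch of a subclass using it follows; the subclass name and the particular transformation are hypothetical, while HuggingFaceCheckpointer and the hook signature come from this commit.

from typing import Tuple

from transformers import PreTrainedModel, PreTrainedTokenizerBase

from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer


class PaddedTokenizerCheckpointer(HuggingFaceCheckpointer):
    """Hypothetical subclass that pins a pad token before the checkpoint is saved."""

    def transform_model_and_tokenizer(
        self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase
    ) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer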

def _save_checkpoint(self, state: State, logger: Logger):
del logger # unused

@@ -405,6 +432,10 @@ def dtensor_to_tensor_hook(
new_model_instance.load_state_dict(state_dict, assign=True)
del state_dict

# Transform the model and tokenizer before saving
new_model_instance, original_tokenizer = self.transform_model_and_tokenizer(
new_model_instance, original_tokenizer)

log.debug('Saving Hugging Face checkpoint to disk')
new_model_instance.save_pretrained(temp_save_dir)
if original_tokenizer is not None:
4 changes: 3 additions & 1 deletion llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
@@ -9,6 +9,8 @@
from composer.loggers import Logger
from composer.utils import dist

__all__ = ['MegaBlocksMoE_TokPerExpert']


class MegaBlocksMoE_TokPerExpert(Callback):
"""Log tokens per expert for MegaBlocks MoE.
@@ -44,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback):
Args:
log_interval (int, optional): The interval on which to log (Default: 10).
log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log
log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log
only aggregate statistics (Default: False).
all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log
statistics for GPU 0 (Default: False).
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/monolithic_ckpt_callback.py
@@ -15,6 +15,8 @@
from composer.utils import (dist, format_name_with_dist_and_time, parse_uri,
reproducibility)

__all__ = ['MonolithicCheckpointSaver']


class MonolithicCheckpointSaver(Callback):
"""Save a monolithic checkpoint every N batches.
4 changes: 4 additions & 0 deletions llmfoundry/callbacks/resumption_callbacks.py
@@ -7,6 +7,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

from llmfoundry.utils.warnings import experimental_class

__all__ = [
'GlobalLRScaling',
'LayerFreezing',
@@ -15,6 +17,7 @@
log = logging.getLogger(__name__)


@experimental_class('GlobalLRScaling')
class GlobalLRScaling(Callback):
"""GlobalLRScaling.
@@ -52,6 +55,7 @@ def fit_start(self, state: State, logger: Logger) -> None:
]


@experimental_class('LayerFreezing')
class LayerFreezing(Callback):
"""LayerFreezing.
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/scheduled_gc_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ScheduledGarbageCollector']


def gc_cuda():
"""Garbage collect Torch (CUDA) memory."""
15 changes: 13 additions & 2 deletions llmfoundry/data/__init__.py
@@ -4,9 +4,14 @@
from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
StreamingFinetuningDataset,
build_finetuning_dataloader)
from llmfoundry.data.text_data import (StreamingTextDataset,
build_text_dataloader)
from llmfoundry.data.packing import (BinPackCollator, auto_packing_ratio,
profile_packing)
from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
StreamingTextDataset,
build_text_dataloader,
get_tokens_per_batch_func)
from llmfoundry.registry import dataloaders

dataloaders.register('text', func=build_text_dataloader)
@@ -15,9 +20,15 @@
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'build_text_dataloader',
'NoConcatDataset',
'ConcatTokensDataset',
'build_dataloader',
'BinPackCollator',
'auto_packing_ratio',
'profile_packing',
'ConcatenatedSequenceCollatorWrapper',
'get_tokens_per_batch_func',
]
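
Because the module now registers its builders through llmfoundry.registry (the dataloaders.register('text', ...) call above), user code can register additional dataloaders the same way. The sketch below is hypothetical: the 'my_text' name and the wrapper builder are illustrations, while the register call and the builder signature mirror build_dataloader in llmfoundry/data/dataloader.py further down.

from composer.core import DataSpec
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.text_data import build_text_dataloader
from llmfoundry.registry import dataloaders


def build_my_text_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
                             device_batch_size: int) -> DataSpec:
    # Hypothetical wrapper: delegate to the stock text builder; a real custom
    # builder could adjust cfg or construct its own DataSpec here.
    return build_text_dataloader(cfg, tokenizer, device_batch_size)


# Registered under a new name, assumed to be selectable from a YAML config by name.
dataloaders.register('my_text', func=build_my_text_dataloader)
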
5 changes: 5 additions & 0 deletions llmfoundry/data/data.py
@@ -11,6 +11,11 @@
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase

__all__ = [
'ConcatTokensDataset',
'NoConcatDataset',
]


class NoConcatDataset(IterableDataset):
"""An IterableDataset that returns text samples for MDSWriter.
4 changes: 4 additions & 0 deletions llmfoundry/data/dataloader.py
@@ -10,6 +10,10 @@
from llmfoundry import registry
from llmfoundry.utils.registry_utils import construct_from_registry

__all__ = [
'build_dataloader',
]


def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
device_batch_size: int) -> DataSpec:
13 changes: 12 additions & 1 deletion llmfoundry/data/finetuning/__init__.py
@@ -3,5 +3,16 @@

from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.data.finetuning.tasks import (StreamingFinetuningDataset,
dataset_constructor,
is_valid_ift_example,
tokenize_formatted_example)

__all__ = ['Seq2SeqFinetuningCollator', 'build_finetuning_dataloader']
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'dataset_constructor',
'tokenize_formatted_example',
'is_valid_ift_example',
'StreamingFinetuningDataset',
]
4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/collator.py
@@ -10,6 +10,10 @@

log = logging.getLogger(__name__)

__all__ = [
'Seq2SeqFinetuningCollator',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -23,6 +23,10 @@

log = logging.getLogger(__name__)

__all__ = [
'build_finetuning_dataloader',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100
