Merge branch 'main' into wip
dakinggg committed Apr 23, 2024
2 parents 5209049 + c53622e commit e432090
Showing 111 changed files with 7,273 additions and 488 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
my-copy-c4*/
my-copy-arxiv*/
*.jsonl*
!tests/eval/local_data/*.jsonl

# WandB
wandb/
56 changes: 23 additions & 33 deletions llmfoundry/__init__.py
@@ -19,49 +19,39 @@

hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

from llmfoundry import algorithms, callbacks, loggers, optim, registry, utils
from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
Seq2SeqFinetuningCollator,
build_finetuning_dataloader)
from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
flash_attn_fn, scaled_multihead_dot_product_attention)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
from llmfoundry import (algorithms, callbacks, cli, data, eval, interfaces,
loggers, metrics, models, optim, tokenizers, utils)
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.hf import ComposerHFCausalLM
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel, MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper
from llmfoundry.optim import DecoupledLionW

__all__ = [
'build_finetuning_dataloader',
'Seq2SeqFinetuningCollator',
'MPTBlock',
'FFN_CLASS_REGISTRY',
'MPTMLP',
'build_ffn',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'InContextLearningDataset',
'InContextLearningMetric',
'ComposerHFCausalLM',
'MPTConfig',
'MPTPreTrainedModel',
'MPTModel',
'MPTForCausalLM',
'ComposerMPTCausalLM',
'ComposerHFCausalLM',
'ComposerHFT5',
'scaled_multihead_dot_product_attention',
'flash_attn_fn',
'MultiheadAttention',
'NoConcatDataset',
'ConcatTokensDataset',
'attn_bias_shape',
'build_attn_bias',
'build_alibi_bias',
'optim',
'utils',
'loggers',
'DecoupledLionW',
'algorithms',
'callbacks',
'TiktokenTokenizerWrapper',
'registry',
'cli',
'data',
'eval',
'interfaces',
'loggers',
'metrics',
'models',
'optim',
'tokenizers',
'utils',
]

__version__ = '0.7.0'
__version__ = '0.8.0.dev0'
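
For readers tracking the import reorganization above, here is a minimal smoke test of the 0.8.0.dev0 top-level surface. It is an illustrative sketch, not part of the commit: the import paths are copied from the new import block, and it assumes each symbol still resolves in an environment with this version installed.

# Sketch of the reorganized public API after this commit; paths are taken from
# the import block above, and their availability at runtime is assumed.
from llmfoundry import data, eval, metrics, models, optim, tokenizers, utils
from llmfoundry.data import StreamingFinetuningDataset, StreamingTextDataset
from llmfoundry.eval import InContextLearningDataset, InContextLearningMetric
from llmfoundry.models.mpt import ComposerMPTCausalLM, MPTConfig, MPTForCausalLM
from llmfoundry.optim import DecoupledLionW

# Low-level attention helpers are no longer re-exported at the top level; they
# are assumed to remain importable from their defining module.
from llmfoundry.models.layers.attention import scaled_multihead_dot_product_attention
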
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/async_eval_callback.py
@@ -27,6 +27,8 @@

log = logging.getLogger(__name__)

__all__ = ['AsyncEval']

REQUIRED_PARAMS_FOR_EVAL = {
'device_eval_batch_size',
'icl_tasks', # only required for eval, may not be specified in pure training
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/curriculum_learning_callback.py
@@ -20,6 +20,8 @@

log = logging.getLogger(__name__)

__all__ = ['CurriculumLearning']


@experimental_class('CurriculumLearning callback')
class CurriculumLearning(CallbackWithConfig):
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/fdiff_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['FDiffMetrics']


class FDiffMetrics(Callback):
"""Rate of change of metrics.
55 changes: 43 additions & 12 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -12,8 +12,9 @@
import time
from multiprocessing.context import SpawnProcess
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
from composer.core import Callback, Event, State, Time, TimeUnit
@@ -35,6 +36,8 @@

log = logging.getLogger(__name__)

__all__ = ['HuggingFaceCheckpointer']

_LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE)


@@ -158,8 +161,6 @@ def __init__(
if mlflow_logging_config is None:
mlflow_logging_config = {}
if self.mlflow_registered_model_name is not None:
import numpy as np

# Both the metadata and the task are needed in order for mlflow
# and databricks optimized model serving to work
passed_metadata = mlflow_logging_config.get('metadata', {})
@@ -169,18 +170,17 @@
default_input_example = {
'prompt': np.array(['What is Machine Learning?'])
}
is_chat = mlflow_logging_config['task'].endswith(
'chat') or mlflow_logging_config['metadata'].get(
'task', '').endswith('chat')
is_chat = mlflow_logging_config['task'].endswith('chat') or (
mlflow_logging_config['metadata'] is not None and
mlflow_logging_config['metadata'].get('task',
'').endswith('chat'))
if is_chat:
default_input_example = {
'messages':
np.array([{
'role': 'user',
'content': 'What is Machine Learning?'
}])
'messages': [{
'role': 'user',
'content': 'What is Machine Learning?'
}]
}
mlflow_logging_config.setdefault('example_no_conversion', True)
mlflow_logging_config.setdefault('input_example',
default_input_example)
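
The branch above decides which default input_example the checkpointer attaches for MLflow logging. A hypothetical constructor argument that would take the chat path is sketched below; the task strings are assumptions based only on the .endswith('chat') check, not values taken from this diff.

# Hypothetical mlflow_logging_config passed to HuggingFaceCheckpointer; either
# the top-level task or metadata['task'] ending in 'chat' selects the
# messages-style default input_example shown above.
mlflow_logging_config = {
    'task': 'llm/v1/chat',               # assumed task string
    'metadata': {'task': 'llm/v1/chat'},
}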

@@ -258,6 +258,16 @@ def _is_last_batch(self, state: State):
return True

assert state.max_duration is not None # for pyright

epoch_complete = state.dataloader_len == state.timestamp.batch_in_epoch
second_to_last_epoch = state.max_duration.unit == TimeUnit.EPOCH and (
state.timestamp.epoch == state.max_duration.value - 1)
# If the save interval is specified as exactly the same number of batches as the total duration,
# but the max duration is specified in epochs, we need a special case to identify we are on the last batch
# and should write the mlflow checkpoint. This should occur on the last batch of the final epoch.
if self.save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete:
return True
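
A worked example of the new condition, with hypothetical numbers rather than values from this diff: a 2-epoch run with 100 batches per epoch and a save_interval of 200 batches. Time and TimeUnit are the composer.core classes already imported in this file.

# Illustrative only: mirrors the branch above with made-up values.
from composer.core import Time, TimeUnit

save_interval = Time(200, TimeUnit.BATCH)  # save interval given in batches
max_duration = Time(2, TimeUnit.EPOCH)     # max duration given in epochs
dataloader_len = 100                       # batches per epoch
epoch, batch_in_epoch = 1, 100             # 0-indexed epoch counter, final batch

epoch_complete = dataloader_len == batch_in_epoch
second_to_last_epoch = (max_duration.unit == TimeUnit.EPOCH and
                        epoch == max_duration.value - 1)
# All three conditions hold, so _is_last_batch returns True and the MLflow
# checkpoint is written on this batch.
assert save_interval.unit == TimeUnit.BATCH and second_to_last_epoch and epoch_complete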

# If the save interval is specified as 1dur, and the max duration is in epoch units
# we need a special case to identify we are on the last batch and should write the mlflow checkpoint
if self.save_interval.unit == TimeUnit.DURATION and self.save_interval.value == 1 and state.max_duration.unit == TimeUnit.EPOCH:
@@ -273,6 +283,23 @@ def _all_child_processes_done(self) -> bool:
dist.all_reduce(x, reduce_operation='MAX')
return x.item() == 0

def transform_model_and_tokenizer(
self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
"""Transform the model and tokenizer before saving.
This allows a subclass to modify the model and tokenizer before saving. The base class implementation will
make no modifications.
Args:
model (PreTrainedModel): The model to be transformed.
tokenizer (PreTrainedTokenizerBase): The tokenizer to be transformed.
Returns:
Tuple[PreTrainedModel, PreTrainedTokenizerBase]: The transformed model and tokenizer.
"""
return model, tokenizer
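
The new hook gives subclasses one place to adjust the artifacts before they are written to disk and registered. A minimal sketch of a subclass using it follows; the subclass name and the particular transformation are hypothetical, while HuggingFaceCheckpointer and the hook signature come from this commit.

from typing import Tuple

from transformers import PreTrainedModel, PreTrainedTokenizerBase

from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer


class PaddedTokenizerCheckpointer(HuggingFaceCheckpointer):
    """Hypothetical subclass that pins a pad token before the checkpoint is saved."""

    def transform_model_and_tokenizer(
        self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase
    ) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer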

def _save_checkpoint(self, state: State, logger: Logger):
del logger # unused

@@ -405,6 +432,10 @@ def dtensor_to_tensor_hook(
new_model_instance.load_state_dict(state_dict, assign=True)
del state_dict

# Transform the model and tokenizer before saving
new_model_instance, original_tokenizer = self.transform_model_and_tokenizer(
new_model_instance, original_tokenizer)

log.debug('Saving Hugging Face checkpoint to disk')
new_model_instance.save_pretrained(temp_save_dir)
if original_tokenizer is not None:
4 changes: 3 additions & 1 deletion llmfoundry/callbacks/log_mbmoe_tok_per_expert_callback.py
@@ -9,6 +9,8 @@
from composer.loggers import Logger
from composer.utils import dist

__all__ = ['MegaBlocksMoE_TokPerExpert']


class MegaBlocksMoE_TokPerExpert(Callback):
"""Log tokens per expert for MegaBlocks MoE.
@@ -44,7 +46,7 @@ class MegaBlocksMoE_TokPerExpert(Callback):
Args:
log_interval (int, optional): The interval on which to log (Default: 10).
log_every_layer (bool, optional): Enable logging ever layer's statisictics (True) or log
log_every_layer (bool, optional): Enable logging ever layer's statistics (True) or log
only aggregate statistics (Default: False).
all_reduce_stats (bool, optional): Enable aggregating statistics across gpus (True) or log
statistics for GPU 0 (Default: False).
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/monolithic_ckpt_callback.py
@@ -15,6 +15,8 @@
from composer.utils import (dist, format_name_with_dist_and_time, parse_uri,
reproducibility)

__all__ = ['MonolithicCheckpointSaver']


class MonolithicCheckpointSaver(Callback):
"""Save a monolithic checkpoint every N batches.
4 changes: 4 additions & 0 deletions llmfoundry/callbacks/resumption_callbacks.py
@@ -7,6 +7,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

from llmfoundry.utils.warnings import experimental_class

__all__ = [
'GlobalLRScaling',
'LayerFreezing',
@@ -15,6 +17,7 @@
log = logging.getLogger(__name__)


@experimental_class('GlobalLRScaling')
class GlobalLRScaling(Callback):
"""GlobalLRScaling.
@@ -52,6 +55,7 @@ def fit_start(self, state: State, logger: Logger) -> None:
]


@experimental_class('LayerFreezing')
class LayerFreezing(Callback):
"""LayerFreezing.
2 changes: 2 additions & 0 deletions llmfoundry/callbacks/scheduled_gc_callback.py
@@ -8,6 +8,8 @@
from composer.core import Callback, State
from composer.loggers import Logger

__all__ = ['ScheduledGarbageCollector']


def gc_cuda():
"""Garbage collect Torch (CUDA) memory."""
15 changes: 13 additions & 2 deletions llmfoundry/data/__init__.py
@@ -4,9 +4,14 @@
from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.finetuning import (Seq2SeqFinetuningCollator,
StreamingFinetuningDataset,
build_finetuning_dataloader)
from llmfoundry.data.text_data import (StreamingTextDataset,
build_text_dataloader)
from llmfoundry.data.packing import (BinPackCollator, auto_packing_ratio,
profile_packing)
from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper,
StreamingTextDataset,
build_text_dataloader,
get_tokens_per_batch_func)
from llmfoundry.registry import dataloaders

dataloaders.register('text', func=build_text_dataloader)
@@ -15,9 +20,15 @@
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'StreamingFinetuningDataset',
'StreamingTextDataset',
'build_text_dataloader',
'NoConcatDataset',
'ConcatTokensDataset',
'build_dataloader',
'BinPackCollator',
'auto_packing_ratio',
'profile_packing',
'ConcatenatedSequenceCollatorWrapper',
'get_tokens_per_batch_func',
]
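
Because the module now registers its builders through llmfoundry.registry (the dataloaders.register('text', ...) call above), user code can register additional dataloaders the same way. The sketch below is hypothetical: the 'my_text' name and the wrapper builder are illustrations, while the register call and the builder signature mirror build_dataloader in llmfoundry/data/dataloader.py further down.

from composer.core import DataSpec
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.text_data import build_text_dataloader
from llmfoundry.registry import dataloaders


def build_my_text_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
                             device_batch_size: int) -> DataSpec:
    # Hypothetical wrapper: delegate to the stock text builder; a real custom
    # builder could adjust cfg or construct its own DataSpec here.
    return build_text_dataloader(cfg, tokenizer, device_batch_size)


# Registered under a new name, assumed to be selectable from a YAML config by name.
dataloaders.register('my_text', func=build_my_text_dataloader)
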
5 changes: 5 additions & 0 deletions llmfoundry/data/data.py
@@ -11,6 +11,11 @@
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase

__all__ = [
'ConcatTokensDataset',
'NoConcatDataset',
]


class NoConcatDataset(IterableDataset):
"""An IterableDataset that returns text samples for MDSWriter.
4 changes: 4 additions & 0 deletions llmfoundry/data/dataloader.py
@@ -10,6 +10,10 @@
from llmfoundry import registry
from llmfoundry.utils.registry_utils import construct_from_registry

__all__ = [
'build_dataloader',
]


def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
device_batch_size: int) -> DataSpec:
13 changes: 12 additions & 1 deletion llmfoundry/data/finetuning/__init__.py
@@ -3,5 +3,16 @@

from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.data.finetuning.tasks import (StreamingFinetuningDataset,
dataset_constructor,
is_valid_ift_example,
tokenize_formatted_example)

__all__ = ['Seq2SeqFinetuningCollator', 'build_finetuning_dataloader']
__all__ = [
'Seq2SeqFinetuningCollator',
'build_finetuning_dataloader',
'dataset_constructor',
'tokenize_formatted_example',
'is_valid_ift_example',
'StreamingFinetuningDataset',
]
4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/collator.py
@@ -10,6 +10,10 @@

log = logging.getLogger(__name__)

__all__ = [
'Seq2SeqFinetuningCollator',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

4 changes: 4 additions & 0 deletions llmfoundry/data/finetuning/dataloader.py
@@ -23,6 +23,10 @@

log = logging.getLogger(__name__)

__all__ = [
'build_finetuning_dataloader',
]

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100
