Merge branch 'main' into model_gauntlet
codestar12 committed Jun 28, 2023
2 parents 3f22feb + 9235e38 commit 266fcea
Showing 18 changed files with 337 additions and 168 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -120,6 +120,9 @@ ENV/
env.bak/
venv.bak/

# python venv installed in the dir, llmfoundry-venv
*-venv

# Spyder project settings
.spyderproject
.spyproject
@@ -143,3 +146,6 @@

# macOS
.DS_Store

# notebooks
notebooks/
6 changes: 3 additions & 3 deletions llmfoundry/__init__.py
@@ -1,9 +1,9 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

try:
import torch
import torch

try:
from llmfoundry import optim, utils
from llmfoundry.data import (ConcatTokensDataset,
MixtureOfDenoisersCollator, NoConcatDataset,
@@ -24,7 +24,7 @@

except ImportError as e:
try:
is_cuda_available = torch.cuda.is_available() # type: ignore
is_cuda_available = torch.cuda.is_available()
except:
is_cuda_available = False

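This hunk moves `import torch` out of the try block so torch is always bound at module scope; with that, the `# type: ignore` on the CUDA probe in the except handler is no longer needed. A minimal sketch of the overall pattern, assuming the install-hint message (the exact wording in the file may differ):

    import torch

    try:
        from llmfoundry import optim, utils
    except ImportError as e:
        try:
            # torch is importable even when the optional extras are not
            is_cuda_available = torch.cuda.is_available()
        except:
            is_cuda_available = False
        extras = '.[gpu]' if is_cuda_available else '.'
        raise ImportError(
            f'Please install the requirements with pip install {extras}'
        ) from e
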
8 changes: 5 additions & 3 deletions llmfoundry/callbacks/fdiff_callback.py
@@ -47,10 +47,12 @@ def batch_end(self, state: State, logger: Logger):
def eval_end(self, state: State, logger: Logger):
if self.diff_eval_metrics:
evaluator = state.dataloader_label
metrics = list(state.eval_metrics[evaluator].keys()) # type: ignore
assert evaluator is not None, 'dataloader should have been set'

metrics = list(state.eval_metrics[evaluator].keys())

for k in metrics:
mkey = '/'.join(['metrics', evaluator, k]) # type: ignore
mkey = '/'.join(['metrics', evaluator, k])
if mkey in self.eval_prev_metric.keys():
logger.log_metrics({
f'{mkey}_fdiff':
@@ -59,5 +61,5 @@ def eval_end(self, state: State, logger: Logger):
})

for k in metrics:
mkey = '/'.join(['metrics', evaluator, k]) # type: ignore
mkey = '/'.join(['metrics', evaluator, k])
self.eval_prev_metric[mkey] = state.eval_metric_values[k]
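The edit here swaps `# type: ignore` comments for an assert: `state.dataloader_label` is an optional string, and asserting it is not None both narrows the type for the checker and fails loudly if the label was never set. A minimal sketch of the narrowing pattern, with hypothetical names:

    from typing import Optional

    def metric_key(evaluator: Optional[str], metric: str) -> str:
        # the assert narrows Optional[str] to str, so no ignore comment is needed
        assert evaluator is not None, 'dataloader should have been set'
        return '/'.join(['metrics', evaluator, metric])
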
12 changes: 7 additions & 5 deletions llmfoundry/callbacks/generate_callback.py
@@ -74,9 +74,9 @@ def generate(self, state: State, logger: Logger):
dummy_input = device.tensor_to_device(dummy_input)
with get_precision_context(state.precision):
with torch.no_grad():
_ = model.model(input_ids=dummy_input) # type: ignore
_ = model.model(input_ids=dummy_input)

output_token_ids = model.model.generate( # type: ignore
output_token_ids = model.model.generate(
input_ids=tokenized_input['input_ids'],
attention_mask=tokenized_input['attention_mask'],
synced_gpus=True,
Expand All @@ -85,9 +85,11 @@ def generate(self, state: State, logger: Logger):

if dist.get_global_rank() == 0:
if self.wandb_logger is not None:
artifact = wandb.Artifact(
'generate_samples_' + str(wandb.run.id), # type: ignore
type='predictions')
assert wandb.run is not None, 'wandb should have started run'

artifact = wandb.Artifact('generate_samples_' +
str(wandb.run.id),
type='predictions')

rows = []
for i in range(len(self.prompts)):
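Same pattern as the previous file: instead of silencing the type checker, the callback now asserts that `wandb.run` exists before reading its id. A hedged sketch of the guarded artifact construction (mirroring just this hunk, not the full callback):

    import wandb

    def make_generate_artifact() -> wandb.Artifact:
        # wandb.run is None until wandb.init() has run; assert the precondition
        assert wandb.run is not None, 'wandb should have started run'
        return wandb.Artifact('generate_samples_' + str(wandb.run.id),
                              type='predictions')
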
10 changes: 5 additions & 5 deletions llmfoundry/data/denoising.py
@@ -354,7 +354,7 @@ def build_text_denoising_dataloader(
cfg: DictConfig,
tokenizer: Tokenizer,
device_batch_size: int,
) -> DataLoader:
) -> DataLoader[Dict]:
"""Constructor function for a Mixture of Denoisers dataloader.
This function constructs a dataloader that can be used to train an
@@ -480,7 +480,7 @@ def build_text_denoising_dataloader(
batch_size=device_batch_size,
)

if dataset.tokenizer.pad_token is None: # type: ignore
if dataset.tokenizer.pad_token is None:
dataset.tokenizer.pad_token = dataset.tokenizer.eos_token

if cfg.dataset.get('packing_ratio'):
@@ -564,7 +564,7 @@ def noise_token_sequence(
else:
u = np.random.uniform(low=(mask_ratio * 2) - 1, high=1.0)
mean_span_length = float(np.round(1 + u * (length - 1)))
mask_ratio = mean_span_length / length # type: ignore
mask_ratio = mean_span_length / length
use_sentinels = False
else:
use_sentinels = True
@@ -871,9 +871,9 @@ def _format_tokens_for_decoder_only(
tokenizer = build_tokenizer(tokenizer_cfg)

loader = build_text_denoising_dataloader(cfg, tokenizer, device_batch_size)
assert isinstance(loader.dataset, StreamingTextDataset)

print(
f'\n\nTRUNCATING TO: {loader.dataset.max_seq_len}\n\n') # type: ignore
print(f'\n\nTRUNCATING TO: {loader.dataset.max_seq_len}\n\n')

packing = cfg.dataset.get('packing_ratio') is not None
if packing:
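Besides dropping the ignore comments, this file narrows the loader's return type to `DataLoader[Dict]`. The pad-token fallback visible in the hunk is worth noting: tokenizers that ship without a pad token reuse EOS for padding. A minimal sketch, assuming a GPT-2-style Hugging Face tokenizer:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    if tokenizer.pad_token is None:
        # GPT-2 has no pad token; reuse EOS so batches can be padded
        tokenizer.pad_token = tokenizer.eos_token
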
182 changes: 108 additions & 74 deletions llmfoundry/models/hf/hf_causal_lm.py
@@ -6,6 +6,9 @@
import os
from typing import Mapping, Union

# required for loading a python model into composer
import peft
import transformers
from composer.metrics.nlp import (InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningMCExpectedCalibrationError,
@@ -30,7 +33,8 @@ class ComposerHFCausalLM(HuggingFaceModelWithZLoss):
"""Configures a :class:`.HuggingFaceModel` around a Causal LM.
Args:
cfg (DictConfig): An omegaconf dictionary used to configure the model:
om_model_config (DictConfig | peft.peft_model.PeftModel | transformers.PreTrainedModel): either an omegaconf dictionary used to configure the model, or an instantiated model object from the peft or transformers library.
if DictConfig, the following keys are required:
cfg.pretrained_model_name_or_path (str): The name of or local path to
the HF Causal LM (e.g., `gpt2` to instantiate a GPT2LMHeadModel).
cfg.config_overrides (dict, optional): An optional dictionary of keyword
@@ -45,34 +49,12 @@
tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
"""

def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
trust_remote_code = om_model_config.get('trust_remote_code', True)
use_auth_token = om_model_config.get('use_auth_token', False)
config = AutoConfig.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
)

# set config overrides
for k, v in om_model_config.get('config_overrides', {}).items():
if not hasattr(config, k):
raise ValueError(
f'config does not have attribute "{k}" to override ({k}: {v}).'
)

attr = getattr(config, k)
if isinstance(attr, Mapping):
extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
if extra_keys:
raise ValueError(
f'Config dict override got unknown keys. '
f'Extra keys: {extra_keys}. '
f'Expected (a subset of) keys: {list(attr.keys())}.')
getattr(config, k).update(v)
else:
setattr(config, k, v)
def __init__(self,
om_model_config: Union[DictConfig, peft.peft_model.PeftModel,
transformers.PreTrainedModel],
tokenizer: Tokenizer):

# set up training and eval metrics
train_metrics = [
LanguageCrossEntropy(),
LanguagePerplexity(),
@@ -87,64 +69,116 @@ def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
InContextLearningMCExpectedCalibrationError()
]

init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
# resolved version to initialize the HF model
resolved_init_device = hf_get_init_device(init_device)

# We need to have all non-zero local ranks be not-pretrained
# Rank 0 will still be pretrained, and distribute the weights appropriately
if dist.get_local_rank() != 0 and init_device == 'mixed':
om_model_config.pretrained = False

if resolved_init_device == 'cpu':
if om_model_config.pretrained:
model = AutoModelForCausalLM.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
config=config)
# if we are passed a DictConfig, we need to instantiate the model
if isinstance(om_model_config, DictConfig):

# load the model config
trust_remote_code = om_model_config.get('trust_remote_code', True)
use_auth_token = om_model_config.get('use_auth_token', False)
config = AutoConfig.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
)

# set config overrides
for k, v in om_model_config.get('config_overrides', {}).items():
if not hasattr(config, k):
raise ValueError(
f'config does not have attribute "{k}" to override ({k}: {v}).'
)

attr = getattr(config, k)
if isinstance(attr, Mapping):
extra_keys = [
_k for _k in v.keys() if _k not in attr.keys()
]
if extra_keys:
raise ValueError(
f'Config dict override got unknown keys. '
f'Extra keys: {extra_keys}. '
f'Expected (a subset of) keys: {list(attr.keys())}.'
)
getattr(config, k).update(v)
else:
setattr(config, k, v)

# below we set up the device to initialize the model on
init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
# resolved version to initialize the HF model
resolved_init_device = hf_get_init_device(init_device)

# We need to have all non-zero local ranks be not-pretrained
# Rank 0 will still be pretrained, and distribute the weights appropriately
if dist.get_local_rank() != 0 and init_device == 'mixed':
om_model_config.pretrained = False

# initialize the model on the correct device
if resolved_init_device == 'cpu':
if om_model_config.pretrained:
model = AutoModelForCausalLM.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
config=config)
else:
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
elif resolved_init_device == 'meta':
if om_model_config.pretrained:
raise ValueError(
'Setting cfg.pretrained=True is not supported when init_device="meta".'
)
with init_empty_weights(include_buffers=False):
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
else:
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
elif resolved_init_device == 'meta':
if om_model_config.pretrained:
raise ValueError(
'Setting cfg.pretrained=True is not supported when init_device="meta".'
)
with init_empty_weights(include_buffers=False):
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
else:
raise ValueError(
f'init_device="{init_device}" must be either "cpu" or "meta".')

signal_file_path = '.local_rank0_completed_autoresume'
if dist.get_local_rank() == 0:
with open(signal_file_path, 'wb') as f:
f.write(b'local_rank0_completed_download')
signal_file_path = '.local_rank0_completed_autoresume'
if dist.get_local_rank() == 0:
with open(signal_file_path, 'wb') as f:
f.write(b'local_rank0_completed_download')

# Avoid the collective call until the local rank zero has finished trying to download the checkpoint
# so that we don't timeout for large downloads. This syncs all processes on the node
with dist.local_rank_zero_download_and_wait(signal_file_path):
# Then, wait to ensure every node has finished downloading the checkpoint
dist.barrier()

# Avoid the collective call until the local rank zero has finished trying to download the checkpoint
# so that we don't timeout for large downloads. This syncs all processes on the node
with dist.local_rank_zero_download_and_wait(signal_file_path):
# Then, wait to ensure every node has finished downloading the checkpoint
dist.barrier()
if dist.get_local_rank() == 0:
os.remove(signal_file_path)

if dist.get_local_rank() == 0:
os.remove(signal_file_path)
z_loss = om_model_config.get('z_loss', 0.0)

# elif the model is either a PeftModel or a PreTrainedModel
elif isinstance(
om_model_config,
(peft.peft_model.PeftModel, transformers.PreTrainedModel)):
model = om_model_config
init_device = 'cpu'
z_loss = 0.0

# else, unsupported type
else:
raise ValueError(
f'om_model_config must be either a DictConfig, PeftModel, or PreTrainedModel, but got {type(om_model_config)}'
)

composer_model = super().__init__(model=model,
shift_labels=True,
tokenizer=tokenizer,
metrics=train_metrics,
eval_metrics=eval_metrics,
z_loss=om_model_config.get(
'z_loss', 0.0),
z_loss=z_loss,
init_device=init_device)

return composer_model
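The net effect of this rewrite is a three-way dispatch on the type of `om_model_config`: build the model from a DictConfig, pass through an already-instantiated PEFT or transformers model, or raise. A condensed sketch of that control flow (`build_from_config` is a hypothetical helper standing in for the config branch above):

    from typing import Union

    import peft
    import transformers
    from omegaconf import DictConfig

    def resolve_model(om_model_config: Union[DictConfig,
                                             peft.peft_model.PeftModel,
                                             transformers.PreTrainedModel]):
        if isinstance(om_model_config, DictConfig):
            # construct the HF model from the omegaconf settings
            model = build_from_config(om_model_config)  # hypothetical helper
            init_device = om_model_config.get('init_device', 'cpu')
            z_loss = om_model_config.get('z_loss', 0.0)
        elif isinstance(om_model_config,
                        (peft.peft_model.PeftModel, transformers.PreTrainedModel)):
            # the caller already instantiated the model; use it as-is
            model, init_device, z_loss = om_model_config, 'cpu', 0.0
        else:
            raise ValueError(
                f'om_model_config must be either a DictConfig, PeftModel, or '
                f'PreTrainedModel, but got {type(om_model_config)}')
        return model, init_device, z_loss
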
2 changes: 0 additions & 2 deletions llmfoundry/models/hf/hf_prefix_lm.py
@@ -98,8 +98,6 @@ def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
if om_model_config.get('adapt_vocab_for_denoising', False):
adapt_tokenizer_for_denoising(tokenizer)

vocab_size = len(tokenizer)

init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
2 changes: 0 additions & 2 deletions llmfoundry/models/hf/hf_t5.py
@@ -91,8 +91,6 @@ def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):
if om_model_config.get('adapt_vocab_for_denoising', False):
adapt_tokenizer_for_denoising(tokenizer)

vocab_size = len(tokenizer)

init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
4 changes: 3 additions & 1 deletion llmfoundry/models/layers/norm.py
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Dict, Type

import torch


@@ -107,7 +109,7 @@ def forward(self, x):
self.eps).to(dtype=x.dtype)


NORM_CLASS_REGISTRY = {
NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {
'layernorm': torch.nn.LayerNorm,
'low_precision_layernorm': LPLayerNorm,
'rmsnorm': RMSNorm,
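Annotating the registry as `Dict[str, Type[torch.nn.Module]]` lets a type checker verify that every registered value is a module class. A minimal usage sketch under that annotation (`build_norm` is a hypothetical helper, not part of the diff):

    from typing import Dict, Type

    import torch

    NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {
        'layernorm': torch.nn.LayerNorm,
    }

    def build_norm(name: str, normalized_shape: int) -> torch.nn.Module:
        if name.lower() not in NORM_CLASS_REGISTRY:
            raise ValueError(f'Unknown norm class: {name}')
        # instantiate the registered class; the annotation guarantees
        # every value constructs a torch.nn.Module
        return NORM_CLASS_REGISTRY[name.lower()](normalized_shape)
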
[Diffs for the remaining changed files did not load.]