Feature/fsdp lora #435

Closed
wants to merge 39 commits into from
Changes from 17 commits
Commits
93cdae8
attempt to wrfsdp wrap lora modules
danbider Jul 6, 2023
0ec0de1
Merge branch 'mosaicml:main' into feature/fsdp-lora
danbider Jul 6, 2023
20ab8b6
fsdp works by iterating over modulers
danbider Jul 6, 2023
57659c5
merged remote
danbider Jul 6, 2023
d6cf053
cleaned up fsdp loop for peft
danbider Jul 7, 2023
a44b641
robust peft import
danbider Jul 7, 2023
d957d55
fsdp known issue deleted
danbider Jul 7, 2023
e5e012d
more info in tutorial about fsdp
danbider Jul 7, 2023
f7b5e70
conditioning on peft installation for cpu tests
danbider Jul 7, 2023
1cf348c
Merge branch 'main' into feature/fsdp-lora
codestar12 Jul 7, 2023
6a1c172
Merge branch 'mosaicml:main' into feature/fsdp-lora
danbider Jul 9, 2023
a3f370c
moved lora model building to ComposerHFCausalLM
danbider Jul 11, 2023
082f71e
formatting
danbider Jul 11, 2023
058951d
updated tutorial to move lora config under model config
danbider Jul 12, 2023
f57c84f
Merge branch 'mosaicml:main' into feature/fsdp-lora
danbider Jul 15, 2023
cc7a8f9
Merge branch 'mosaicml:main' into feature/fsdp-lora
danbider Jul 24, 2023
433ae51
Merge branch 'main' into feature/fsdp-lora
dakinggg Aug 1, 2023
7c68c19
merged upstream main, fixed conflicts
danbider Aug 15, 2023
4118367
added typecheck for peft model
danbider Aug 15, 2023
5db8c74
more pyright fixes
danbider Aug 15, 2023
3a3342f
more typechecking in training script
danbider Aug 15, 2023
622e51d
Merge branch 'main' into feature/fsdp-lora
danbider Aug 16, 2023
9ec0f69
pyright following main merge
danbider Aug 16, 2023
a4439c5
model_config instead of cfg.model
danbider Aug 16, 2023
0a9e542
Update TUTORIAL.md
danbider Aug 17, 2023
9bc0b50
Update llmfoundry/models/hf/hf_fsdp.py
danbider Aug 17, 2023
f2fd418
DDP tutorial edit
danbider Aug 17, 2023
050267f
edit fsdp stuff
danbider Aug 21, 2023
c0f5148
fixed popping
danbider Aug 30, 2023
1c47c23
eliminated bnb dep
danbider Aug 30, 2023
5b905b0
Merge branch 'feature/fsdp-lora' of https://github.com/danbider/llm-f…
danbider Aug 30, 2023
27d186d
Merge branch 'main' into feature/fsdp-lora
josejg Oct 21, 2023
2f59377
Update accelerate for peft
josejg Oct 23, 2023
5bc5240
Simplify LoRA validation logic
josejg Oct 23, 2023
79cf8d6
Proper import checking
josejg Oct 24, 2023
7f72c25
Fix indent
josejg Oct 30, 2023
02d949c
Prevent FDSP wrapping empty embedding LoRA attributes
josejg Oct 30, 2023
b955696
Merge branch 'main' into feature/fsdp-lora
josejg Oct 31, 2023
4a430bd
Fix bad indent
josejg Oct 31, 2023
40 changes: 32 additions & 8 deletions TUTORIAL.md
@@ -330,19 +330,43 @@ The majority of our training setups use `triton`. -->


### Can I finetune using PEFT / LoRA?
- The LLM Foundry codebase does not directly include examples of PEFT or LoRA workflows. However, our MPT model is a subclass of HuggingFace `PreTrainedModel`, and https://github.com/mosaicml/llm-foundry/pull/346 added the features required to enable HuggingFace's [PEFT](https://huggingface.co/docs/peft/index) / [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) workflows for MPT. MPT models with LoRA modules can be trained either with LLM Foundry or with Hugging Face's [accelerate](https://huggingface.co/docs/accelerate/index). Within LLM Foundry, run `scripts/train/train.py`, adding `model.lora` arguments to the config `.yaml`, like so:
<!--pytest.mark.skip-->
```yaml
model:
  name: hf_causal_lm
  pretrained: true
  ...
  lora:
    args:
      r: 16
      lora_alpha: 32
      target_modules: ["Wqkv", "out_proj", "up_proj", "down_proj"]
      lora_dropout: 0.05
      bias: none
      task_type: "CAUSAL_LM"
```
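Under the hood, LLM Foundry passes these `lora.args` to PEFT (`LoraConfig(**lora_cfg.args)` followed by `get_peft_model`, as in the `hf_causal_lm.py` changes below). For reference, a minimal standalone sketch of the same adapter setup, assuming `peft` and `transformers` are installed; the `mosaicml/mpt-7b` checkpoint is used purely as an illustration:
<!--pytest.mark.skip-->
```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the base causal LM (illustrative checkpoint; any HF causal LM works).
model = AutoModelForCausalLM.from_pretrained('mosaicml/mpt-7b',
                                             trust_remote_code=True)

# Mirror the `model.lora.args` block from the YAML above.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['Wqkv', 'out_proj', 'up_proj', 'down_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
)

# Wrap the model so only the injected LoRA adapters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```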
You can train LoRA models using FSDP for further memory savings. In your `.yaml`, specify:
<!--pytest.mark.skip-->
```yaml
fsdp_config:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
```

Collaborator (on `use_orig_params: true`): Can we confirm if this is necessary?

Contributor (author): will verify this tomorrow AM, good point
or default to DDP, as follows:
Collaborator: I think to default DDP just leaving out the FSDP section entirely is a bit cleaner?

<!--pytest.mark.skip-->
```yaml
fsdp:
  {}
```

- In the current release, these features have Beta support.
- For efficiency, the MPT model concatenates the `Q`, `K`, and `V` matrices in each attention block into a single `Wqkv` matrix that is three times wider. Currently, LoRA supports a low-rank approximation to this `Wqkv` matrix (see the sketch below).
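As a rough sketch of what a low-rank adapter on the fused `Wqkv` projection amounts to (dimensions are illustrative, not MPT's actual sizes), the adapted weight is the frozen `Wqkv` plus a scaled product of two thin matrices:
<!--pytest.mark.skip-->
```python
import torch

d_model = 256                             # illustrative hidden size
wqkv = torch.randn(3 * d_model, d_model)  # fused Q/K/V projection, frozen during LoRA training

r, alpha = 16, 32                         # match `r` and `lora_alpha` in the YAML above
lora_A = torch.randn(r, d_model) * 0.01   # trainable low-rank factor
lora_B = torch.zeros(3 * d_model, r)      # starts at zero so the adapter is initially a no-op

# Effective weight used in the forward pass; only lora_A and lora_B receive gradients.
wqkv_adapted = wqkv + (alpha / r) * (lora_B @ lora_A)
```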

### Can I quantize these models and/or run on CPU?
- The LLM Foundry codebase does not directly have examples of quantization or limited-resource inference. But you can check out [GGML](https://github.com/ggerganov/ggml) (the same library that powers llama.cpp), which has built support for efficiently running MPT models on CPU! You _can_ load your model in 8-bit precision for inference using the [bitsandbytes library](https://github.com/TimDettmers/bitsandbytes) and Hugging Face's [accelerate](https://huggingface.co/docs/accelerate/index) via `model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto", trust_remote_code=True)`, although we have not extensively benchmarked the performance (see the Hugging Face [quantization documentation](https://huggingface.co/docs/transformers/main/main_classes/quantization) for more detail).
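For example, a minimal 8-bit loading sketch based on the call above, assuming `bitsandbytes` and `accelerate` are installed; the checkpoint name is illustrative:
<!--pytest.mark.skip-->
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'mosaicml/mpt-7b'  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Weights are loaded in 8-bit via bitsandbytes; accelerate's device_map places layers automatically.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
    trust_remote_code=True,
)
```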
219 changes: 114 additions & 105 deletions llmfoundry/models/hf/hf_causal_lm.py
@@ -5,6 +5,7 @@

import os
from typing import Mapping, Union
from warnings import warn

# required for loading a python model into composer
import transformers
@@ -24,26 +25,37 @@
from llmfoundry.models.utils import init_empty_weights

try:
from peft.peft_model import PeftModel
model_types = PeftModel, transformers.PreTrainedModel
_om_model_config_type = Union[DictConfig, PeftModel,
transformers.PreTrainedModel]
from peft import LoraConfig, get_peft_model
_peft_installed = True

except ImportError:
model_types = transformers.PreTrainedModel
_om_model_config_type = Union[DictConfig, transformers.PreTrainedModel]
# raising warnings below only if users try to use PEFT
_peft_installed = False

__all__ = ['ComposerHFCausalLM']

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]


def print_trainable_parameters(model) -> None:
# Prints the number of trainable parameters in the model.
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
if param.requires_grad:
trainable_params += param.numel()
print(
f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}'
)


class ComposerHFCausalLM(HuggingFaceModelWithZLoss):
"""Configures a :class:`.HuggingFaceModel` around a Causal LM.

Args:
om_model_config (DictConfig | PeftModel | transformers.PreTrainedModel): either n omegaconf dictionary used to configure the model, or an instantiated model object from the peft or transformers library.
if DictConfig, the following keys are required:
om_model_config (DictConfig): an omegaconf dictionary used to configure the model.
the following keys are required:
cfg.pretrained_model_name_or_path (str): The name of or local path to
the HF Causal LM (e.g., `gpt2` to instantiate a GPT2LMHeadModel).
cfg.config_overrides (dict, optional): An optional dictionary of keyword
@@ -58,8 +70,7 @@ class ComposerHFCausalLM(HuggingFaceModelWithZLoss):
tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
"""

def __init__(self, om_model_config: _om_model_config_type,
tokenizer: Tokenizer):
def __init__(self, om_model_config: DictConfig, tokenizer: Tokenizer):

# set up training and eval metrics
train_metrics = [
@@ -76,108 +87,106 @@ def __init__(self, om_model_config: _om_model_config_type,
InContextLearningMCExpectedCalibrationError()
]

# if we are passed a DictConfig, we need to instantiate the model
if isinstance(om_model_config, DictConfig):

# load the model config
trust_remote_code = om_model_config.get('trust_remote_code', True)
use_auth_token = om_model_config.get('use_auth_token', False)
config = AutoConfig.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
)

# set config overrides
for k, v in om_model_config.get('config_overrides', {}).items():
if not hasattr(config, k):
raise ValueError(
f'config does not have attribute "{k}" to override ({k}: {v}).'
)

attr = getattr(config, k)
if isinstance(attr, Mapping):
extra_keys = [
_k for _k in v.keys() if _k not in attr.keys()
]
if extra_keys:
raise ValueError(
f'Config dict override got unknown keys. '
f'Extra keys: {extra_keys}. '
f'Expected (a subset of) keys: {list(attr.keys())}.'
)
getattr(config, k).update(v)
else:
setattr(config, k, v)

# below we set up the device to initialize the model on
init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
# resolved version to initialize the HF model
resolved_init_device = hf_get_init_device(init_device)

# We need to have all non-zero local ranks be not-pretrained
# Rank 0 will still be pretrained, and distribute the weights appropriately
if dist.get_local_rank() != 0 and init_device == 'mixed':
om_model_config.pretrained = False

# initialize the model on the correct device
if resolved_init_device == 'cpu':
if om_model_config.pretrained:
model = AutoModelForCausalLM.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
config=config)
else:
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
elif resolved_init_device == 'meta':
if om_model_config.pretrained:
# load the model config
trust_remote_code = om_model_config.get('trust_remote_code', True)
use_auth_token = om_model_config.get('use_auth_token', False)
config = AutoConfig.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
)

# set config overrides
for k, v in om_model_config.get('config_overrides', {}).items():
if not hasattr(config, k):
raise ValueError(
f'config does not have attribute "{k}" to override ({k}: {v}).'
)

attr = getattr(config, k)
if isinstance(attr, Mapping):
extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
if extra_keys:
raise ValueError(
'Setting cfg.pretrained=True is not supported when init_device="meta".'
)
with init_empty_weights(include_buffers=False):
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
f'Config dict override got unknown keys. '
f'Extra keys: {extra_keys}. '
f'Expected (a subset of) keys: {list(attr.keys())}.')
getattr(config, k).update(v)
else:
setattr(config, k, v)

# below we set up the device to initialize the model on
init_device = om_model_config.get('init_device', 'cpu')

# Get the device we want to initialize, and use the
# resolved version to initialize the HF model
resolved_init_device = hf_get_init_device(init_device)

# We need to have all non-zero local ranks be not-pretrained
# Rank 0 will still be pretrained, and distribute the weights appropriately
if dist.get_local_rank() != 0 and init_device == 'mixed':
om_model_config.pretrained = False

# initialize the model on the correct device
if resolved_init_device == 'cpu':
if om_model_config.pretrained:
model = AutoModelForCausalLM.from_pretrained(
om_model_config.pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
use_auth_token=use_auth_token,
config=config)
else:
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)
elif resolved_init_device == 'meta':
if om_model_config.pretrained:
raise ValueError(
f'init_device="{init_device}" must be either "cpu" or "meta".'
'Setting cfg.pretrained=True is not supported when init_device="meta".'
)
with init_empty_weights(include_buffers=False):
model = AutoModelForCausalLM.from_config(
config,
trust_remote_code=trust_remote_code,
)

signal_file_path = '.local_rank0_completed_autoresume'
if dist.get_local_rank() == 0:
with open(signal_file_path, 'wb') as f:
f.write(b'local_rank0_completed_download')

# Avoid the collective call until the local rank zero has finished trying to download the checkpoint
# so that we don't timeout for large downloads. This syncs all processes on the node
with dist.local_rank_zero_download_and_wait(signal_file_path):
# Then, wait to ensure every node has finished downloading the checkpoint
dist.barrier()

if dist.get_local_rank() == 0:
os.remove(signal_file_path)

z_loss = om_model_config.get('z_loss', 0.0)

# elif the model is either a PeftModel or a PreTrainedModel
elif isinstance(om_model_config, model_types):
model = om_model_config
init_device = 'cpu'
z_loss = 0.0

# else, unsupported type
else:
raise ValueError(
f'om_model_config must be either a DictConfig, PeftModel, or PreTrainedModel, but got {type(om_model_config)}'
)
f'init_device="{init_device}" must be either "cpu" or "meta".')

signal_file_path = '.local_rank0_completed_autoresume'
if dist.get_local_rank() == 0:
with open(signal_file_path, 'wb') as f:
f.write(b'local_rank0_completed_download')

# Avoid the collective call until the local rank zero has finished trying to download the checkpoint
# so that we don't timeout for large downloads. This syncs all processes on the node
with dist.local_rank_zero_download_and_wait(signal_file_path):
# Then, wait to ensure every node has finished downloading the checkpoint
dist.barrier()

if dist.get_local_rank() == 0:
os.remove(signal_file_path)

z_loss = om_model_config.get('z_loss', 0.0)

# if om_model_config includes lora and peft is installed, add lora modules
lora_cfg = om_model_config.get('lora', None)
if lora_cfg is not None:
if _peft_installed:
print('Building Lora config...')
lora_cfg = LoraConfig(**lora_cfg.args)
print('Lora config built.')
print('Adding Lora modules...')
model = get_peft_model(model, lora_cfg)
print('Lora modules added.')
print_trainable_parameters(model)
else:
warn(
"cfg.model.lora is given but PEFT not installed, so not building a PEFT model. Execute pip install -e \".[gpu,peft]\" and try again."
)

print('Wrapping model in composer...')
composer_model = super().__init__(model=model,
shift_labels=True,
tokenizer=tokenizer,
18 changes: 18 additions & 0 deletions llmfoundry/models/hf/hf_fsdp.py
@@ -5,12 +5,20 @@
# which is MIT licensed

import functools
import warnings
from typing import Any, Iterable, List

import torch
from transformers import PreTrainedModel
from transformers.models.opt.modeling_opt import OPTDecoder

try:
from peft import LoraModel
lora_model_type = LoraModel
except ImportError:
lora_model_type = None
warnings.warn('peft is not installed, LoraModel will not be available')


# helper functions
def rhasattr(obj: Any, attr: str):
@@ -182,6 +190,16 @@ def prepare_hf_causal_lm_model_for_fsdp(model: PreTrainedModel,
tied_embeddings._fsdp_wrap = False # type: ignore
lm_head._fsdp_wrap = False # type: ignore

# applying ._fsdp_wrap = True for the LoRA modules
# this is needed because added LoRA modules have requires_grad=True,
# while the rest of the modules have requires_grad=False
if lora_model_type is not None: # peft is installed
if isinstance(model.base_model,
lora_model_type): # we have built a LoraModel
for name, module in model_block.named_modules():
if 'lora' in name: # peft adds modules named with lora
module._fsdp_wrap = True

# FSDP Wrap and Activation Checkpoint every model block
model.fsdp_wrap_fn = lambda module: isinstance(module, block_type)
model.activation_checkpointing_fn = lambda module: isinstance(