8-bit LION, take 2 #514

Merged · 35 commits · Aug 24, 2023

Changes shown below are from 14 of the 35 commits.

Commits
c7c5ae6  add decoupled lion8b optimizer + tests + builder option + deps (dblalock, Aug 10, 2023)
cace0b6  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 10, 2023)
1a14857  pre-commit fixes (dblalock, Aug 11, 2023)
e32f1bf  merge upstream (dblalock, Aug 11, 2023)
16ca215  move lion8b kernels dep to "gpu" extra_deps (dblalock, Aug 11, 2023)
014ba69  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 11, 2023)
f391b7f  move fused error checks to llmfoundry (dblalock, Aug 11, 2023)
bcf55bf  make precommit + CodeQL happy? (dblalock, Aug 11, 2023)
6fc1782  disable fsdp param_dtype for low-bit master weights (dblalock, Aug 12, 2023)
ba0e317  add low-precision master weights option + rm needles .get(..., None) (dblalock, Aug 12, 2023)
7a55e07  fix missing import in config_utils (dblalock, Aug 14, 2023)
225ceac  hopefully fix lion8b fsdp checkpointing (dblalock, Aug 14, 2023)
d53f0e5  pre-commit fixes (dblalock, Aug 14, 2023)
c9217b6  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 14, 2023)
78fbfa9  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 14, 2023)
b1125aa  address pr comments (dblalock, Aug 14, 2023)
71e3f9c  merge upstream (dblalock, Aug 14, 2023)
476a9ec  fix descent + zero grad tests not being as stringent as intended (dblalock, Aug 15, 2023)
b87ca31  tiny style change (dblalock, Aug 15, 2023)
f90a71c  address more pr comments + WIP draft of FSDP checkpointing test (dblalock, Aug 15, 2023)
0eb3420  partial fix of fsdp state dict test (dblalock, Aug 17, 2023)
a2a0104  fsdp state dict test passing (dblalock, Aug 18, 2023)
afd9699  get fsdp state dict test passing with different sharding strategies (dblalock, Aug 18, 2023)
dd4ccb3  remove state key name indirection as per pr comments (dblalock, Aug 18, 2023)
abdc6a6  make precommit + pyright happy (dblalock, Aug 18, 2023)
061e74d  merge main (dblalock, Aug 18, 2023)
5082966  fix broken merge (dblalock, Aug 18, 2023)
fbde16b  skip fsdp checkpoint test for torch 1.13.1 since...config classes mis… (dblalock, Aug 19, 2023)
7adfd57  fix wrong var for model config (manual merge fail) (dblalock, Aug 19, 2023)
8f84c41  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 21, 2023)
8dabf20  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 24, 2023)
284a855  print thruputs in thruput test as per pr comments (dblalock, Aug 24, 2023)
7e4c11f  merge upstream (dblalock, Aug 24, 2023)
b81f7bb  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 24, 2023)
eae355f  merge upstream (dblalock, Aug 24, 2023)
6 changes: 5 additions & 1 deletion llmfoundry/optim/__init__.py
@@ -3,5 +3,9 @@

from llmfoundry.optim.adaptive_lion import DecoupledAdaLRLion, DecoupledClipLion
from llmfoundry.optim.lion import DecoupledLionW
+from llmfoundry.optim.lion8b import DecoupledLionW_8bit

-__all__ = ['DecoupledLionW', 'DecoupledClipLion', 'DecoupledAdaLRLion']
+__all__ = [
+    'DecoupledLionW', 'DecoupledLionW_8bit', 'DecoupledClipLion',
+    'DecoupledAdaLRLion'
+]
417 changes: 417 additions & 0 deletions llmfoundry/optim/lion8b.py

Large diffs are not rendered by default.
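
The optimizer itself lives in the new llmfoundry/optim/lion8b.py, which is not rendered here. As a rough conceptual sketch only (this is not the PR's implementation, which keeps its state quantized end to end and dispatches to fused CUDA kernels from the mosaicml-turbo package added in setup.py below), a decoupled LION step with the momentum buffer held in int8 might look like:

```python
import torch


def lion8b_step_sketch(param: torch.Tensor,
                       grad: torch.Tensor,
                       momentum_q: torch.Tensor,  # int8 momentum buffer
                       scale: torch.Tensor,       # per-tensor dequant scale
                       lr: float = 1e-4,
                       betas: tuple = (0.9, 0.99),
                       weight_decay: float = 0.0) -> None:
    """One decoupled-LION step with the momentum stored as int8.

    Conceptual only: DecoupledLionW_8bit in this PR uses its own quantization
    scheme and fused kernels; this just illustrates the idea.
    """
    beta1, beta2 = betas

    # Dequantize the stored momentum for the update math.
    momentum = momentum_q.float() * scale

    # LION: the update direction is the sign of an interpolated momentum.
    update = torch.sign(beta1 * momentum + (1 - beta1) * grad)

    # Decoupled weight decay, applied directly to the parameters.
    param.mul_(1 - lr * weight_decay)
    param.add_(update, alpha=-lr)

    # Momentum update, then requantize to int8 with a fresh per-tensor scale.
    momentum = beta2 * momentum + (1 - beta2) * grad
    new_scale = momentum.abs().max().clamp_min(1e-12) / 127.0
    momentum_q.copy_((momentum / new_scale).round().clamp(-127, 127).to(torch.int8))
    scale.copy_(new_scale)


# Toy usage: one step on a random parameter tensor.
p = torch.randn(1024)
g = torch.randn(1024)
m_q = torch.zeros(1024, dtype=torch.int8)
s = torch.tensor(1.0)
lion8b_step_sketch(p, g, m_q, s, lr=1e-4, weight_decay=1e-2)
```

The point of the 8-bit state is memory: the momentum buffer shrinks from 4 bytes to roughly 1 byte per parameter, at the cost of a quantize/dequantize round-trip each step.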

6 changes: 5 additions & 1 deletion llmfoundry/utils/builders.py
@@ -26,7 +26,7 @@
                                  LayerFreezing, MonolithicCheckpointSaver,
                                  ScheduledGarbageCollector)
from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
-                              DecoupledLionW)
+                              DecoupledLionW, DecoupledLionW_8bit)


def build_callback(name: str, kwargs: Dict[str, Any]):
@@ -115,6 +115,10 @@ def build_optimizer(cfg: DictConfig, model: torch.nn.Module):
                                  timeout=cfg.timeout,
                                  lr_penalty=cfg.lr_penalty,
                                  min_scale=cfg.min_scale)
+    elif cfg.name.lower() == 'decoupled_lionw_8b':
+        # str() cast is just for pyright
+        kwargs = {str(k): v for k, v in cfg.items() if k != 'name'}
+        return DecoupledLionW_8bit(model.parameters(), **kwargs)
    else:
        raise ValueError(f'Not sure how to build optimizer: {cfg.name}')

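For reference, the branch above forwards every config key other than name straight to DecoupledLionW_8bit. A minimal usage sketch of the builder follows; the lr/betas/weight_decay hyperparameters are assumptions about what the optimizer's constructor accepts, since only the decoupled_lionw_8b name and the keyword forwarding are confirmed by this diff, and the quantized path additionally needs a CUDA device with the mosaicml-turbo kernels installed.

```python
import torch
from omegaconf import DictConfig

from llmfoundry.utils.builders import build_optimizer

# Toy module standing in for a real model.
model = torch.nn.Linear(16, 16)

# Everything except 'name' is passed through as a keyword argument, so the
# hyperparameters below are illustrative assumptions, not a documented API.
optimizer_cfg = DictConfig({
    'name': 'decoupled_lionw_8b',
    'lr': 1e-4,
    'betas': [0.9, 0.99],
    'weight_decay': 1e-5,
})

optimizer = build_optimizer(optimizer_cfg, model)
```
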
21 changes: 20 additions & 1 deletion llmfoundry/utils/config_utils.py
@@ -4,7 +4,7 @@
import contextlib
import math
import warnings
-from typing import Dict, Optional, Union
+from typing import Dict, Mapping, Optional, Union

from composer.utils import dist
from omegaconf import DictConfig
@@ -86,6 +86,25 @@ def process_init_device(model_cfg: DictConfig, fsdp_config: Optional[Dict]):
        # Set defaults for mixed initialization
        fsdp_config.setdefault('use_orig_params', False)
        fsdp_config.setdefault('load_monolith_rank0_only', True)

+    # no mixed precision needed for weights when they're already 16 bits
+    master_dtype = model_cfg.get('master_weights_dtype')
+    small_dtypes = ('bf16', 'f16', 'float16', 'bfloat16', 'amp_fp16',
+                    'amp_bf16')
+    if fsdp_config and master_dtype in small_dtypes:
+        reduce_dtype = None
+        buffer_dtype = None
+        mixed_precision = fsdp_config.get('mixed_precision')
+        if isinstance(mixed_precision, Mapping):
+            reduce_dtype = mixed_precision.get('reduce_dtype')
+            buffer_dtype = mixed_precision.get('buffer_dtype')
+        fsdp_config['mixed_precision'] = {
+            'param_dtype': None,
+            'reduce_dtype': reduce_dtype,
+            'buffer_dtype': buffer_dtype,
+            'keep_low_precision_grads': True,
+        }
+
    return init_context


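In effect, the hunk above says: if the master weights are already 16-bit, don't let FSDP cast parameters again (param_dtype=None) and keep gradients in low precision, while preserving any user-specified reduce and buffer dtypes. A condensed, standalone sketch of that behavior, mirroring the added lines rather than calling llm-foundry directly:

```python
from typing import Any, Dict, Mapping, Optional


def override_fsdp_mixed_precision(model_cfg: Mapping[str, Any],
                                  fsdp_config: Optional[Dict[str, Any]]) -> None:
    """Mirror of the added config_utils logic, for illustration only."""
    small_dtypes = ('bf16', 'f16', 'float16', 'bfloat16', 'amp_fp16', 'amp_bf16')
    master_dtype = model_cfg.get('master_weights_dtype')
    if not (fsdp_config and master_dtype in small_dtypes):
        return

    # Preserve any user-specified reduce/buffer dtypes...
    reduce_dtype = None
    buffer_dtype = None
    mixed_precision = fsdp_config.get('mixed_precision')
    if isinstance(mixed_precision, Mapping):
        reduce_dtype = mixed_precision.get('reduce_dtype')
        buffer_dtype = mixed_precision.get('buffer_dtype')

    # ...but never cast params (already 16-bit) and keep grads low-precision.
    fsdp_config['mixed_precision'] = {
        'param_dtype': None,
        'reduce_dtype': reduce_dtype,
        'buffer_dtype': buffer_dtype,
        'keep_low_precision_grads': True,
    }


fsdp_config = {'sharding_strategy': 'FULL_SHARD',
               'mixed_precision': {'reduce_dtype': 'bf16'}}
override_fsdp_mixed_precision({'master_weights_dtype': 'bf16'}, fsdp_config)
print(fsdp_config['mixed_precision'])
# {'param_dtype': None, 'reduce_dtype': 'bf16', 'buffer_dtype': None,
#  'keep_low_precision_grads': True}
```

This matches the 6fc1782 commit message ("disable fsdp param_dtype for low-bit master weights"): once the weights themselves are bf16/fp16, FSDP's usual cast-down-for-compute step is redundant.
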
27 changes: 16 additions & 11 deletions scripts/train/train.py
@@ -194,15 +194,16 @@ def main(cfg: DictConfig):
    cfg = update_batch_size_info(cfg)

    # Read FSDP Config as a dict
-    fsdp_config = cfg.get('fsdp_config', None)
-    fsdp_config = om.to_container(fsdp_config,
-                                  resolve=True) if fsdp_config else None
-    assert isinstance(fsdp_config, Dict) or fsdp_config is None
-    if dist.get_world_size() == 1 and fsdp_config is not None:
-        warnings.warn(
-            'FSDP is not applicable for single-GPU training. Reverting to DDP.')
-        cfg.pop('fsdp_config')
-        fsdp_config = None
+    fsdp_config = cfg.get('fsdp_config')
+    if fsdp_config is not None:
+        fsdp_config = om.to_container(fsdp_config, resolve=True)
+        assert isinstance(fsdp_config, Dict)
+        if dist.get_world_size() == 1:
+            warnings.warn(
+                'FSDP is not applicable for single-GPU training. Reverting to DDP.'
+            )
+            cfg.pop('fsdp_config')
+            fsdp_config = None

    init_context = process_init_device(cfg.model, fsdp_config)

@@ -212,13 +213,16 @@ def main(cfg: DictConfig):
    # Build Model
    print('Initializing model...')
    with init_context:
-        if cfg.get('lora',
-                   None) is not None:  # frozen model + trainable lora modules
+        if cfg.get('lora') is not None:  # frozen model + trainable lora modules
            model: ComposerHFCausalLM = build_composer_peft_model(
                cfg.model, cfg.lora, tokenizer)
            print_trainable_parameters(model)  # should not be 100%
        else:  # standard model
            model = build_composer_model(cfg.model, tokenizer)
+        if cfg.model.get('master_weights_dtype') in ('bf16', 'bfloat16'):
+            model = model.to(dtype=torch.bfloat16)
+        elif cfg.model.get('master_weights_dtype') in ('f16', 'float16'):
+            model = model.to(dtype=torch.float16)
    cfg.n_params = sum(p.numel() for p in model.parameters())
    print(f'{cfg.n_params=:.2e}')

@@ -342,5 +346,6 @@ def main(cfg: DictConfig):
        yaml_cfg = om.load(f)
    cli_cfg = om.from_cli(args_list)
    cfg = om.merge(yaml_cfg, cli_cfg)
+    om.resolve(cfg)
    assert isinstance(cfg, DictConfig)
    main(cfg)
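
The master_weights_dtype handling added to train.py above is just a string-to-dtype cast of the freshly built model. A minimal standalone equivalent; the helper name is ours, while the accepted strings come straight from the diff:

```python
from typing import Optional

import torch


def cast_master_weights(model: torch.nn.Module,
                        master_weights_dtype: Optional[str]) -> torch.nn.Module:
    """Cast master weights per the strings accepted in train.py."""
    if master_weights_dtype in ('bf16', 'bfloat16'):
        return model.to(dtype=torch.bfloat16)
    if master_weights_dtype in ('f16', 'float16'):
        return model.to(dtype=torch.float16)
    return model  # None or anything else: leave the weights as built


model = cast_master_weights(torch.nn.Linear(8, 8), 'bf16')
assert next(model.parameters()).dtype == torch.bfloat16
```

Together with the config_utils change above, this is what lets the 8-bit optimizer work against true 16-bit master weights under FSDP.
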
1 change: 1 addition & 0 deletions setup.py
@@ -83,6 +83,7 @@

extra_deps['gpu'] = [
    'flash-attn==v1.0.3.post0',
+    'mosaicml-turbo>=0.0.2,<0.1',
    # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
    'xentropy-cuda-lib@git+https://github.com/HazyResearch/[email protected]#subdirectory=csrc/xentropy',
]