
8-bit LION, take 2 #514

Merged: 35 commits into main from davis/lion8b-v2 on Aug 24, 2023

Commits (35):
c7c5ae6  add decoupled lion8b optimizer + tests + builder option + deps (dblalock, Aug 10, 2023)
cace0b6  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 10, 2023)
1a14857  pre-commit fixes (dblalock, Aug 11, 2023)
e32f1bf  merge upstream (dblalock, Aug 11, 2023)
16ca215  move lion8b kernels dep to "gpu" extra_deps (dblalock, Aug 11, 2023)
014ba69  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 11, 2023)
f391b7f  move fused error checks to llmfoundry (dblalock, Aug 11, 2023)
bcf55bf  make precommit + CodeQL happy? (dblalock, Aug 11, 2023)
6fc1782  disable fsdp param_dtype for low-bit master weights (dblalock, Aug 12, 2023)
ba0e317  add low-precision master weights option + rm needles .get(..., None) (dblalock, Aug 12, 2023)
7a55e07  fix missing import in config_utils (dblalock, Aug 14, 2023)
225ceac  hopefully fix lion8b fsdp checkpointing (dblalock, Aug 14, 2023)
d53f0e5  pre-commit fixes (dblalock, Aug 14, 2023)
c9217b6  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 14, 2023)
78fbfa9  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 14, 2023)
b1125aa  address pr comments (dblalock, Aug 14, 2023)
71e3f9c  merge upstream (dblalock, Aug 14, 2023)
476a9ec  fix descent + zero grad tests not being as stringent as intended (dblalock, Aug 15, 2023)
b87ca31  tiny style change (dblalock, Aug 15, 2023)
f90a71c  address more pr comments + WIP draft of FSDP checkpointing test (dblalock, Aug 15, 2023)
0eb3420  partial fix of fsdp state dict test (dblalock, Aug 17, 2023)
a2a0104  fsdp state dict test passing (dblalock, Aug 18, 2023)
afd9699  get fsdp state dict test passing with different sharding strategies (dblalock, Aug 18, 2023)
dd4ccb3  remove state key name indirection as per pr comments (dblalock, Aug 18, 2023)
abdc6a6  make precommit + pyright happy (dblalock, Aug 18, 2023)
061e74d  merge main (dblalock, Aug 18, 2023)
5082966  fix broken merge (dblalock, Aug 18, 2023)
fbde16b  skip fsdp checkpoint test for torch 1.13.1 since...config classes mis… (dblalock, Aug 19, 2023)
7adfd57  fix wrong var for model config (manual merge fail) (dblalock, Aug 19, 2023)
8f84c41  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 21, 2023)
8dabf20  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 24, 2023)
284a855  print thruputs in thruput test as per pr comments (dblalock, Aug 24, 2023)
7e4c11f  merge upstream (dblalock, Aug 24, 2023)
b81f7bb  Merge branch 'main' into davis/lion8b-v2 (dblalock, Aug 24, 2023)
eae355f  merge upstream (dblalock, Aug 24, 2023)
6 changes: 5 additions & 1 deletion llmfoundry/optim/__init__.py
@@ -3,5 +3,9 @@

 from llmfoundry.optim.adaptive_lion import DecoupledAdaLRLion, DecoupledClipLion
 from llmfoundry.optim.lion import DecoupledLionW
+from llmfoundry.optim.lion8b import DecoupledLionW_8bit

-__all__ = ['DecoupledLionW', 'DecoupledClipLion', 'DecoupledAdaLRLion']
+__all__ = [
+    'DecoupledLionW', 'DecoupledLionW_8bit', 'DecoupledClipLion',
+    'DecoupledAdaLRLion'
+]
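
For reference, the new export can be constructed directly like any torch optimizer. A minimal sketch follows; the keyword names follow the usual LION hyperparameters and are assumptions rather than the verified constructor signature, and the values are placeholders, not recommendations.

import torch

from llmfoundry.optim import DecoupledLionW_8bit

model = torch.nn.Linear(16, 16)
# Hyperparameter names and values here are illustrative assumptions.
opt = DecoupledLionW_8bit(model.parameters(), lr=1e-4, betas=(0.9, 0.99),
                          weight_decay=1e-5)

On GPU, the quantized momentum state is what saves memory relative to DecoupledLionW.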
429 changes: 429 additions & 0 deletions llmfoundry/optim/lion8b.py

Large diffs are not rendered by default.
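
The lion8b.py diff is collapsed above. As a rough illustration only, and not the DecoupledLionW_8bit implementation (the real optimizer uses fused CUDA kernels from mosaicml-turbo and a more careful quantization scheme), here is a minimal sketch of the underlying idea: a decoupled LION step whose momentum state is stored as int8 plus a per-tensor scale.

import torch


def lion8b_step_sketch(param: torch.Tensor, grad: torch.Tensor,
                       mom_q: torch.Tensor, mom_scale: torch.Tensor,
                       lr: float = 1e-4, beta1: float = 0.9,
                       beta2: float = 0.99, weight_decay: float = 0.0):
    """One LION step with momentum kept as int8 plus a per-tensor scale."""
    mom = mom_q.float() * mom_scale                         # dequantize state
    update = torch.sign(beta1 * mom + (1 - beta1) * grad)   # LION: sign of interpolated momentum
    param.mul_(1 - lr * weight_decay)                       # decoupled weight decay
    param.add_(update, alpha=-lr)
    mom = beta2 * mom + (1 - beta2) * grad                  # momentum update
    new_scale = mom.abs().max().clamp(min=1e-12) / 127.0    # symmetric 8-bit requantization
    new_mom_q = torch.clamp(torch.round(mom / new_scale), -127, 127).to(torch.int8)
    return new_mom_q, new_scale


# Toy usage: the int8 state is ~4x smaller than an fp32 momentum buffer.
p, g = torch.randn(1024), torch.randn(1024)
mom_q = torch.zeros(1024, dtype=torch.int8)
scale = torch.tensor(0.0)
mom_q, scale = lion8b_step_sketch(p, g, mom_q, scale)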

4 changes: 3 additions & 1 deletion llmfoundry/utils/builders.py
@@ -26,7 +26,7 @@
                                   LayerFreezing, MonolithicCheckpointSaver,
                                   ScheduledGarbageCollector)
 from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
-                              DecoupledLionW)
+                              DecoupledLionW, DecoupledLionW_8bit)


 def build_callback(name: str, kwargs: Dict[str, Any]):
@@ -98,6 +98,8 @@ def build_optimizer(model: torch.nn.Module, name: str,
         return DecoupledClipLion(model.parameters(), **optimizer_config)
     elif name == 'adalr_lion':
         return DecoupledAdaLRLion(model.parameters(), **optimizer_config)
+    elif name == 'decoupled_lionw_8b':
+        return DecoupledLionW_8bit(model.parameters(), **optimizer_config)
     else:
         raise ValueError(f'Not sure how to build optimizer: {name}')
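
With the new branch in build_optimizer, the optimizer can be selected by name. A minimal sketch, assuming the third positional argument is the optimizer_config dict as the hunk header suggests, with placeholder hyperparameters:

import torch

from llmfoundry.utils.builders import build_optimizer

model = torch.nn.Linear(8, 8)
optimizer_config = {'lr': 1e-4, 'weight_decay': 1e-5}  # placeholders only
opt = build_optimizer(model, 'decoupled_lionw_8b', optimizer_config)

In a run YAML this would correspond to setting the optimizer name to decoupled_lionw_8b, with the remaining keys passed through as keyword arguments.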
21 changes: 20 additions & 1 deletion llmfoundry/utils/config_utils.py
@@ -4,7 +4,7 @@
 import contextlib
 import math
 import warnings
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Mapping, Optional, Union

 from composer.utils import dist
 from omegaconf import DictConfig, ListConfig
@@ -116,6 +116,25 @@ def process_init_device(model_cfg: DictConfig, fsdp_config: Optional[Dict]):
         # Set defaults for mixed initialization
         fsdp_config.setdefault('use_orig_params', False)
         fsdp_config.setdefault('load_monolith_rank0_only', True)

+    # no mixed precision needed for weights when they're already 16 bits
+    master_dtype = model_cfg.get('master_weights_dtype')
+    small_dtypes = ('bf16', 'f16', 'float16', 'bfloat16', 'amp_fp16',
+                    'amp_bf16')
+    if fsdp_config and master_dtype in small_dtypes:
+        reduce_dtype = None
+        buffer_dtype = None
+        mixed_precision = fsdp_config.get('mixed_precision')
+        if isinstance(mixed_precision, Mapping):
+            reduce_dtype = mixed_precision.get('reduce_dtype')
+            buffer_dtype = mixed_precision.get('buffer_dtype')
+        fsdp_config['mixed_precision'] = {
+            'param_dtype': None,
+            'reduce_dtype': reduce_dtype,
+            'buffer_dtype': buffer_dtype,
+            'keep_low_precision_grads': True,
+        }
+
     return init_context
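
To make the effect of the new block concrete, here is a before/after view of a hypothetical fsdp_config when master_weights_dtype is 'bf16' (the surrounding keys and dtype strings are made up for the example):

fsdp_config_before = {
    'sharding_strategy': 'FULL_SHARD',
    'mixed_precision': {'param_dtype': 'bf16', 'reduce_dtype': 'fp32'},
}

# After process_init_device with master_weights_dtype='bf16':
fsdp_config_after = {
    'sharding_strategy': 'FULL_SHARD',
    'mixed_precision': {
        'param_dtype': None,          # weights are already bf16, so no re-cast
        'reduce_dtype': 'fp32',       # carried over from the existing config
        'buffer_dtype': None,         # was not set before, so stays None
        'keep_low_precision_grads': True,
    },
}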
5 changes: 5 additions & 0 deletions scripts/train/train.py
@@ -400,6 +400,10 @@ def main(cfg: DictConfig):
         print_trainable_parameters(model)  # should not be 100%
     else:  # standard model
         model = build_composer_model(model_config, tokenizer)
+    if model_config.get('master_weights_dtype') in ('bf16', 'bfloat16'):
+        model = model.to(dtype=torch.bfloat16)
+    elif model_config.get('master_weights_dtype') in ('f16', 'float16'):
+        model = model.to(dtype=torch.float16)

     # Log number of parameters
     n_params = sum(p.numel() for p in model.parameters())
@@ -515,5 +519,6 @@ def main(cfg: DictConfig):
         yaml_cfg = om.load(f)
     cli_cfg = om.from_cli(args_list)
     cfg = om.merge(yaml_cfg, cli_cfg)
+    om.resolve(cfg)
     assert isinstance(cfg, DictConfig)
     main(cfg)
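
Besides the master-weights cast, the other train.py addition is om.resolve(cfg), which materializes OmegaConf interpolations in place before the config reaches main. A minimal sketch with a made-up config:

from omegaconf import OmegaConf as om

cfg = om.create({
    'max_seq_len': 2048,
    'model': {'max_seq_len': '${max_seq_len}'},  # interpolation
})
om.resolve(cfg)                       # replaces the interpolation in place
assert cfg.model.max_seq_len == 2048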
1 change: 1 addition & 0 deletions setup.py
@@ -84,6 +84,7 @@

 extra_deps['gpu'] = [
     'flash-attn==v1.0.3.post0',
+    'mosaicml-turbo>=0.0.2,<0.1',
     # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
     'xentropy-cuda-lib@git+https://github.com/HazyResearch/[email protected]#subdirectory=csrc/xentropy',
 ]