Merge branch 'main' into davis/lion8b-v2

mosaicml · Aug 24, 2023 · 8dabf20 · 8dabf20
2 parents 8f84c41 + 2f30418
commit 8dabf20
Show file tree

Hide file tree

Showing 8 changed files with 622 additions and 264 deletions.
diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py
@@ -6,6 +6,8 @@
                                            build_icl_evaluators, build_logger,
                                            build_optimizer, build_scheduler,
                                            build_tokenizer)
+    from llmfoundry.utils.checkpoint_conversion_helpers import (
+        convert_and_save_ft_weights, get_hf_tokenizer_from_composer_state_dict)
     from llmfoundry.utils.config_utils import (calculate_batch_size_info,
                                                log_config, pop_config,
                                                update_batch_size_info)
@@ -23,6 +25,8 @@
     'build_icl_evaluators',
     'build_tokenizer',
     'calculate_batch_size_info',
+    'convert_and_save_ft_weights',
+    'get_hf_tokenizer_from_composer_state_dict',
     'update_batch_size_info',
     'log_config',
     'pop_config',

diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -0,0 +1,295 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Helper methods for the checkpoint conversion scripts.
+
+The checkpoint conversion scripts are located in the
+llmfoundry/scripts/inference/benchmarking/ folder. Users should run those
+scripts directly to convert between checkpoints; this file contains only common
+utility functions that are present in multiple scripts.
+"""
+
+import json
+import os
+import random
+import string
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import sentencepiece as spm
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+
+def _get_weight_data_type(data_type: str):
+    """Map a weight precision name to the matching NumPy dtype.
+
+    Args:
+        data_type (str): Precision name. Only 'fp32' and 'fp16' are supported.
+
+    Returns:
+        The NumPy scalar type, ``np.float32`` or ``np.float16``.
+
+    Raises:
+        RuntimeError: If ``data_type`` is not one of the supported names.
+    """
+    if data_type == 'fp32':
+        return np.float32
+    elif data_type == 'fp16':
+        return np.float16
+    else:
+        # f-string so that the offending value is actually interpolated
+        # into the message instead of printing the literal placeholder.
+        raise RuntimeError(
+            f'Unsupported data type: {data_type} for conversion.')
+
+
+# TODO: move this functionality to composer once the bug fixes are upstreamed
+def get_hf_tokenizer_from_composer_state_dict(
+        state_dict: Dict[str, Any],
+        tokenizer_save_dir: Optional[str] = None
+) -> Optional[PreTrainedTokenizer]:
+    """Rebuild the HuggingFace tokenizer stored in a Composer checkpoint.
+
+    The tokenizer files recorded under
+    ``state_dict['state']['integrations']['huggingface']['tokenizer']`` are
+    written to ``tokenizer_save_dir`` and then loaded with
+    ``AutoTokenizer.from_pretrained``.
+
+    Args:
+        state_dict (Dict[str, Any]): A full Composer checkpoint state dict.
+        tokenizer_save_dir (Optional[str]): Directory to write the tokenizer files to. If None, a
+            ``tokenizer-save-dir-<random suffix>`` directory is created in the current working directory.
+
+    Returns:
+        Optional[PreTrainedTokenizer]: The loaded tokenizer, or None if the checkpoint holds an empty
+            tokenizer state.
+
+    Raises:
+        RuntimeError: If ``state_dict`` has no 'state' entry, or no HuggingFace integration state.
+    """
+    if 'state' not in state_dict:
+        raise RuntimeError(
+            'Unexpected composer state dictionary. Did you pass in a full composer checkpoint?'
+        )
+    if 'integrations' not in state_dict[
+            'state'] or 'huggingface' not in state_dict['state']['integrations']:
+        raise RuntimeError(
+            'Did not find HuggingFace related state (e.g., tokenizer) in the provided composer checkpoint!'
+        )
+    hf_tokenizer_state = state_dict['state']['integrations']['huggingface'][
+        'tokenizer']
+    hf_tokenizer = None
+    if hf_tokenizer_state != {}:
+        if tokenizer_save_dir is None:
+            unique_suffix = ''.join(
+                random.choices(string.ascii_letters + string.digits, k=6))
+            tokenizer_save_dir = os.path.join(
+                os.getcwd(), f'tokenizer-save-dir-{unique_suffix}')
+        os.makedirs(tokenizer_save_dir, exist_ok=True)
+
+        for filename, saved_content in hf_tokenizer_state.items():
+            # This cannot be a temporary directory because huggingface relies on the slow tokenizer file
+            # being persistent on disk
+            file_extension = saved_content['file_extension']
+            tokenizer_file_path = Path(
+                tokenizer_save_dir) / f'{filename}{file_extension}'
+            # Text files are written as UTF-8 explicitly so that non-ASCII vocabulary
+            # entries do not depend on the platform's default encoding.
+            if file_extension == '.json':
+                with open(tokenizer_file_path, 'w',
+                          encoding='utf-8') as _tmp_file:
+                    json.dump(saved_content['content'], _tmp_file)
+            elif file_extension == '.txt':
+                with open(tokenizer_file_path, 'w',
+                          encoding='utf-8') as _tmp_file:
+                    for line in saved_content['content']:
+                        _tmp_file.write(line)
+                        _tmp_file.write('\n')
+            elif file_extension == '.py':
+                with open(tokenizer_file_path, 'w',
+                          encoding='utf-8') as _tmp_file:
+                    _tmp_file.write(saved_content['content'])
+            elif file_extension == '.model':
+                # Round-trip the serialized proto through SentencePiece before writing it out.
+                s = spm.SentencePieceProcessor()
+                s.load_from_serialized_proto(saved_content['content'])
+                with open(tokenizer_file_path, 'wb') as _tmp_file:
+                    _tmp_file.write(s.serialized_model_proto())
+            # Any other extension is skipped: no file is written for it.
+
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)
+
+        # remove 'name_or_path'
+        hf_tokenizer.name_or_path = ''
+        hf_tokenizer.init_kwargs['name_or_path'] = ''
+
+    return hf_tokenizer
+
+
+def _write_zero_bias(weight_name: str, weight_file_path: str,
+                     bias_shape: Union[Tuple[int, ...], int]) -> None:
+    """Write zeros for bias when converting MPT to FasterTransformer weights.
+
+    MPT model might not have bias while FT expects bias. The bias file is
+    written in the same directory as the weight file, with '.weight' in the
+    file name replaced by '.bias'.
+
+    Args:
+        weight_name (str): Name of the weight tensor.
+        weight_file_path (str): Output path for storing the weight (NOT zero bias).
+        bias_shape (Union[Tuple[int, ...], int]): Shape of the bias array.
+
+    Raises:
+        RuntimeError: If the file name in ``weight_file_path`` does not contain '.weight'.
+    """
+    # Only the file name is inspected and rewritten. Looking at the whole path
+    # would let a directory name satisfy the check or get renamed by the
+    # replace, and a name without '.weight' would make the bias path equal to
+    # the weight path, overwriting the weight with zeros.
+    weight_dir, weight_file_name = os.path.split(weight_file_path)
+    if '.weight' not in weight_file_name:
+        raise RuntimeError(
+            f'Cannot write zero bias for {weight_name}. Input is not a weight tensor'
+        )
+    print(f'zero bias for weight: {weight_name}')
+    bias_file_path = os.path.join(
+        weight_dir, weight_file_name.replace('.weight', '.bias'))
+    # NOTE(review): the bias is always written as float32, independent of the
+    # dtype used for the weights -- confirm this matches what the consumer expects.
+    bias = np.zeros(bias_shape, dtype=np.float32)
+    bias.tofile(bias_file_path)
+
+
+def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
+                               tensor_name: str, config: Dict[str, Any],
+                               data: np.ndarray):
+    """Convert each MPT weight to a FasterTransformer compatible format.
+
+    Layernorm parameters and the biases of the attention / MLP output
+    projections are written as a single file. Every other handled tensor is
+    split into ``infer_gpu_num`` shards, one file per shard. When
+    ``config['no_bias']`` is set, a zero bias file is also written for weight
+    tensors.
+
+    Args:
+        save_dir (str): Path of the directory to save the weight in FT format. The directory must already exist.
+        infer_gpu_num (int): The number of gpus you are planning to use for inference.
+        tensor_name (str): Name of the weight tensor. Used in naming the output file.
+        config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters.
+            The keys read here are 'd_model', 'mlp_ratio' and 'no_bias'.
+        data (np.ndarray): Tensor data in np.ndarray format.
+
+    Returns:
+        None: Writes to a file in `save_dir`. File name is based on the `tensor_name`
+
+    Raises:
+        RuntimeError: If `tensor_name` does not match any handled tensor.
+    """
+    # Tensors that are written whole, without any per-GPU split.
+    unsplit_patterns = (
+        'input_layernorm.weight',
+        'input_layernorm.bias',
+        'attention.dense.bias',
+        'post_attention_layernorm.weight',
+        'post_attention_layernorm.bias',
+        'mlp.dense_4h_to_h.bias',
+        'final_layernorm.weight',
+        'final_layernorm.bias',
+    )
+
+    if any(pattern in tensor_name for pattern in unsplit_patterns):
+        save_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
+        data.tofile(save_path)
+        if 'weight' in tensor_name and config['no_bias']:
+            _write_zero_bias(tensor_name, save_path, data.shape[-1])
+
+    elif 'attention.dense.weight' in tensor_name:
+        assert data.shape == (
+            config['d_model'],
+            config['d_model']), f'unexpected dim for {tensor_name}'
+        # nn.Linear weights are transposed
+        data = data.T
+        split_vals = np.split(data, infer_gpu_num, axis=0)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+        if config['no_bias']:
+            # The bias is not split, so derive its path from an unsharded
+            # weight file name (no weight is written at this path).
+            fake_weight_path = os.path.join(save_dir,
+                                            f'model.{tensor_name}.bin')
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1])
+
+    elif 'mlp.dense_4h_to_h.weight' in tensor_name:
+        assert data.shape == (
+            config['d_model'], config['mlp_ratio'] *
+            config['d_model']), f'unexpected dim for {tensor_name}'
+        # nn.Linear weights are transposed
+        data = data.T
+        split_vals = np.split(data, infer_gpu_num, axis=0)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+        if config['no_bias']:
+            # The bias is not split, so derive its path from an unsharded
+            # weight file name (no weight is written at this path).
+            fake_weight_path = os.path.join(save_dir,
+                                            f'model.{tensor_name}.bin')
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1])
+
+    elif 'mlp.dense_h_to_4h.weight' in tensor_name:
+        assert data.shape == (
+            config['mlp_ratio'] * config['d_model'],
+            config['d_model']), f'unexpected dim for {tensor_name}'
+        # nn.Linear weights are transposed
+        data = data.T
+
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+            if config['no_bias']:
+                # One zero bias per shard, sized to that shard's last dim.
+                _write_zero_bias(tensor_name, save_path,
+                                 split_vals[j].shape[-1])
+
+    elif 'mlp.dense_h_to_4h.bias' in tensor_name:
+        assert data.shape == (
+            config['mlp_ratio'] *
+            config['d_model'],), f'unexpected dim for {tensor_name}'
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+        for j in range(infer_gpu_num):
+            # Join the directory and file name as separate components.
+            # Concatenating them drops the path separator and writes the
+            # shard outside `save_dir`.
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+
+    elif 'attention.query_key_value.bias' in tensor_name:
+        assert data.shape == (
+            3 * config['d_model'],), f'unexpected dim for {tensor_name}'
+
+        # View as (3, d_model) so the split below cuts along d_model.
+        data = data.reshape(3, config['d_model'])
+
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+
+    elif 'attention.query_key_value.weight' in tensor_name:
+        assert data.shape == (
+            3 * config['d_model'],
+            config['d_model']), f'unexpected dim for {tensor_name}'
+        # nn.Linear weights are transposed
+        data = data.T
+
+        # View as (d_model, 3, d_model) so the split below cuts along the last d_model axis.
+        data = data.reshape(config['d_model'], 3, config['d_model'])
+        split_vals = np.split(data, infer_gpu_num, axis=-1)
+
+        for j in range(infer_gpu_num):
+            save_path = os.path.join(save_dir, f'model.{tensor_name}.{j}.bin')
+            split_vals[j].tofile(save_path)
+            if config['no_bias']:
+                _write_zero_bias(tensor_name, save_path,
+                                 (3, split_vals[j].shape[-1]))
+
+    else:
+        raise RuntimeError(f'Tensor with name {tensor_name} is not handled')
+
+
+def convert_and_save_ft_weights(named_params: dict,
+                                config: dict,
+                                infer_gpu_num: int = 1,
+                                weight_data_type: str = 'fp32',
+                                save_dir: str = ''):
+    """Write every parameter of a Composer MPT model in FasterTransformer format.
+
+    Embedding, final layernorm and LM head parameters are written directly;
+    transformer block parameters are renamed to their FasterTransformer
+    equivalents and handed to `_convert_weight_to_ft_each`.
+
+    Args:
+        named_params (Dict[str, Parameter]): A dictionary containing the Composer MPT model's parameter names and data.
+        config (Dict[str, Any]): Configuration for the model. This is used in getting model specific parameters.
+        infer_gpu_num (int): The number of gpus you are planning to use for inference.
+        weight_data_type (str): The dtype of the converted FasterTransformer model.
+        save_dir (str): Path of the directory to save the weight in FT format. The directory must already exist.
+
+    Returns:
+        None: Writes to the `save_dir` folder. File names within this folder are based on the model parameter names.
+    """
+    target_dtype = _get_weight_data_type(weight_data_type)
+
+    # (MPT name fragment, FasterTransformer name fragment) pairs for the
+    # parameters that live inside a transformer block.
+    block_renames = (
+        ('norm_1.bias', 'input_layernorm.bias'),
+        ('norm_1.weight', 'input_layernorm.weight'),
+        ('attn.Wqkv.bias', 'attention.query_key_value.bias'),
+        ('attn.Wqkv.weight', 'attention.query_key_value.weight'),
+        ('attn.out_proj.bias', 'attention.dense.bias'),
+        ('attn.out_proj.weight', 'attention.dense.weight'),
+        ('norm_2.bias', 'post_attention_layernorm.bias'),
+        ('norm_2.weight', 'post_attention_layernorm.weight'),
+        ('ffn.up_proj.bias', 'mlp.dense_h_to_4h.bias'),
+        ('ffn.up_proj.weight', 'mlp.dense_h_to_4h.weight'),
+        ('ffn.down_proj.bias', 'mlp.dense_4h_to_h.bias'),
+        ('ffn.down_proj.weight', 'mlp.dense_4h_to_h.weight'),
+    )
+
+    for name, param in named_params.items():
+        print(f'Working on parameter {name} ...')
+        data = param.detach().cpu().numpy().astype(target_dtype)
+
+        # Only weights and biases are converted; anything else is reported and skipped.
+        if 'weight' not in name and 'bias' not in name:
+            print(f'found a parameter name that is not handled: {name}')
+            continue
+
+        if name == 'transformer.wpe.weight':
+            assert data.shape == (
+                config['max_seq_len'],
+                config['d_model']), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir, 'model.wpe.bin'))
+            continue
+
+        if name == 'transformer.wte.weight':
+            assert data.shape == (
+                config['vocab_size'],
+                config['d_model']), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir, 'model.wte.bin'))
+            continue
+
+        if name == 'transformer.norm_f.bias':
+            assert data.shape == (
+                config['d_model'],), f'unexpected dim for {name}'
+            data.tofile(os.path.join(save_dir,
+                                     'model.final_layernorm.bias.bin'))
+            continue
+
+        if name == 'transformer.norm_f.weight':
+            assert data.shape == (
+                config['d_model'],), f'unexpected dim for {name}'
+            norm_f_path = os.path.join(save_dir,
+                                       'model.final_layernorm.weight.bin')
+            data.tofile(norm_f_path)
+            if config['no_bias']:
+                _write_zero_bias(name, norm_f_path, data.shape[-1])
+            continue
+
+        if name == 'transformer.lm_head.weight':
+            data.tofile(os.path.join(save_dir, 'model.lm_head.weight.bin'))
+            continue
+
+        # Transformer block parameter: rename to the FasterTransformer layout
+        # and let the per-tensor converter handle splitting and saving.
+        for mpt_pattern, ft_pattern in block_renames:
+            if mpt_pattern not in name:
+                continue
+            ft_name = name.replace('transformer.blocks.', 'layers.')
+            ft_name = ft_name.replace(mpt_pattern, ft_pattern)
+            _convert_weight_to_ft_each(save_dir, infer_gpu_num, ft_name,
+                                       config, data)
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
@@ -97,8 +97,8 @@ def evaluate_model(model_cfg: DictConfig, dist_timeout: Union[float, int],
                    max_seq_len: int, device_eval_batch_size: int,
                    model_gauntlet_config: Optional[Union[str, DictConfig]],
                    fsdp_config: Optional[Dict], num_retries: int,
-                   loggers_cfg: Dict[str, Any], precision: str,
-                   model_gauntlet_df: Optional[pd.DataFrame]):
+                   loggers_cfg: Dict[str, Any], python_log_level: str,
+                   precision: str, model_gauntlet_df: Optional[pd.DataFrame]):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -154,6 +154,7 @@ def evaluate_model(model_cfg: DictConfig, dist_timeout: Union[float, int],
         progress_bar=False,
         log_to_console=True,
         dist_timeout=dist_timeout,
+        python_log_level=python_log_level,
     )
 
     if torch.cuda.is_available():
@@ -191,6 +192,10 @@ def main(cfg: DictConfig):
                                              'device_eval_batch_size',
                                              must_exist=True)
     precision: str = pop_config(cfg, 'precision', must_exist=True)
+    python_log_level: str = pop_config(cfg,
+                                       'python_log_level',
+                                       must_exist=False,
+                                       default_value='debug')
 
     # Optional Evaluation Parameters with default values
     seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17)
@@ -240,8 +245,10 @@ def main(cfg: DictConfig):
              fsdp_config=fsdp_config,
              num_retries=num_retries,
              loggers_cfg=loggers_cfg,
+             python_log_level=python_log_level,
              precision=precision,
-             model_gauntlet_df=model_gauntlet_df)
+             model_gauntlet_df=model_gauntlet_df,
+         )
 
         if model_gauntlet_callback is not None:
             # TODO(bmosaicml) This needs to be refactored to fix the typing issue

diff --git a/scripts/inference/README.md b/scripts/inference/README.md
@@ -199,6 +199,12 @@ python convert_hf_mpt_to_ft.py -i mpt-7b -o mpt-ft-7b --infer_gpu_num 1
 ```
 You can set `infer_gpu_num` to a value greater than 1 to prepare an FT checkpoint for multi-GPU inference. Please open a GitHub issue if you discover any problems!
 
+## Converting a Composer MPT to FasterTransformer
+We include a script `convert_composer_mpt_to_ft.py` that directly converts a Composer MPT checkpoint to the FasterTransformer format. You can either provide a path to a local Composer checkpoint or a URI to a file stored in a cloud supported by Composer (e.g. `s3://`). Simply run:
+```
+python convert_composer_mpt_to_ft.py -i <path_to_composer_checkpoint.pt> -o mpt-ft-7b --infer_gpu_num 1
+```
+
 ## Running MPT with FasterTransformer
 This step assumes that you already have converted an MPT checkpoint to FT format by following the instructions in
 [Converting an HF MPT to FasterTransformer](#converting-an-hf-mpt-to-fastertransformer). It also assumes that you have