Commit c344cec: Add starcoder
bk-jc committed Aug 17, 2023
1 parent 8b254e9 commit c344cec
Showing 8 changed files with 1,224 additions and 152 deletions.
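The new StarCoder support plugs into the existing GGML export entry point, main_export in optimum/exporters/ggml/__main__.py. As a rough sketch of how an export might be driven from Python (the import path and keyword names below are inferred from this diff and should be treated as assumptions, not a verified API):

from optimum.exporters.ggml.__main__ import main_export  # import path assumed

# Hypothetical call: export a StarCoder checkpoint to a ggml .bin file.
main_export(model_name_or_path="bigcode/starcoder", output="./ggml-export", task="text-generation")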
93 changes: 46 additions & 47 deletions optimum/exporters/ggml/__main__.py
@@ -20,12 +20,11 @@
from pathlib import Path
from typing import Optional, Union

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer

from optimum.commands.export.ggml import parse_args_ggml
from optimum.exporters.ggml.utils import infer_task
from optimum.exporters.ggml.utils import bytes_to_unicode, infer_task
from optimum.exporters.tasks import TasksManager
from optimum.utils import logging

@@ -94,63 +93,62 @@ def main_export(
task=task,
)

conv_map = ggml_config.CONV_MAP

fname_out = os.path.join(output, f"ggml-model-{model_name_or_path.split('/')[-1]}-{ftype_str[ftype]}.bin")
fout = open(fname_out, "wb")

# Hardcoded for Bloom. TODO: remove this as a cpp-side argument and hardcode it there so this hparam can be dropped
hparams["multiple_of"] = 1

vocab_size = hparams["vocab_size"]

fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["multiple_of"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", vocab_size))
for key in ggml_config.STRUCT_HPARAM_KEYS:
fout.write(struct.pack("i", hparams[key]))
fout.write(struct.pack("i", ftype))

for i in range(hparams["vocab_size"]):
text = tokenizer.decode([i]).encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
if ggml_config.USE_BYTE_DECODER:
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
encoder = tokenizer.vocab

fout.write(struct.pack("i", vocab_size))

for key in sorted(encoder, key=encoder.get):
text = bytearray([byte_decoder[c] for c in key])
fout.write(struct.pack("i", len(text)))
fout.write(text)

else:
for i in range(vocab_size):
text = tokenizer.decode([i]).encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
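At this point the file holds the magic number, the vocab size, the config-specific struct hparams, the ftype flag, and the vocabulary. A minimal reader sketch for that layout (not part of the commit; it assumes five struct hparams, as in the configs added below, and an optional repeated vocab size for the byte-decoder path):

import struct

def read_ggml_header(path, n_hparams=5, vocab_size_repeated=True):
    # Sketch only: mirrors the writes above under the stated assumptions.
    with open(path, "rb") as f:
        magic, = struct.unpack("i", f.read(4))
        assert magic == 0x67676D6C  # "ggml"
        vocab_size, = struct.unpack("i", f.read(4))
        hparams = struct.unpack(f"{n_hparams}i", f.read(4 * n_hparams))
        ftype, = struct.unpack("i", f.read(4))
        if vocab_size_repeated:  # the byte-decoder branch writes the vocab size a second time
            vocab_size, = struct.unpack("i", f.read(4))
        vocab = [f.read(struct.unpack("i", f.read(4))[0]) for _ in range(vocab_size)]
    return hparams, ftype, vocab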

list_vars = model.state_dict()
for name in list_vars.keys():
src = name
nn = name
if name != "lm_head.weight":
nn = nn.split(".")[1:]
else:
nn = nn.split(".")

if nn[0] == "h":
nn[0] = "layers"
mapped = conv_map[".".join(nn[2:-1])]
name = ".".join(nn[:2] + [mapped] + nn[-1:])
else:
mapped = conv_map[".".join(nn[:-1])]
name = ".".join([mapped] + nn[-1:])

if "query_key_value" in src:
q, k, v = list_vars[src].reshape(config.n_head, 3, -1).unbind(1)
list_vars[src] = torch.cat([q, k, v], dim=0).reshape_as(list_vars[src])

print(src, " -> ", name)
data = list_vars[src].squeeze().numpy()
data = data.astype(np.float32)

n_dims = len(data.shape)
print(name, n_dims, data.shape)

# default type is fp32
ftype_cur = 0
if ftype == 1 and n_dims > 1:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
print("Processing variable: " + name)

if hasattr(ggml_config, "get_cpp_name"):
cpp_name = ggml_config.get_cpp_name(name=name)

if hasattr(ggml_config, "should_skip") and ggml_config.should_skip(name=cpp_name):
continue

if hasattr(ggml_config, "reshape_weights"):
list_vars[name] = ggml_config.reshape_weights(name=cpp_name, weights=list_vars[name], hparams=hparams)

n_dims = len(list_vars[name].shape)
data, ftype_cur = ggml_config.convert_dtype(name=cpp_name, data=list_vars[name], ftype=ftype, n_dims=n_dims)

if data.nbytes % ggml_config.GGML_MEM_ALIGN != 0:
description = f"Expected data (weights of {name}) to have a multiple of f{ggml_config.GGML_MEM_ALIGN} bytes, but data has {data.nbytes} bytes. Skipping to avoid memory alignment issues."
print(f" {description}")
logger.warning(description)
continue

# header
str = name.encode("utf-8")
str = cpp_name.encode("utf-8")
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
@@ -161,6 +159,7 @@

fout.close()

print("Done. Output file: " + fname_out)
if return_source_model:
return model
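After the header and vocabulary, each tensor is emitted as a small record: the iii triple above, the reversed shape, and, in the lines elided from this hunk, presumably the encoded name followed by the raw data (as in the upstream bloomz.cpp converter this file is based on). A reader sketch under that assumption:

import struct

import numpy as np

def read_tensor_record(f):
    # Sketch only: assumes the name bytes and raw data directly follow the dims, as in bloomz.cpp.
    n_dims, name_len, ftype_cur = struct.unpack("iii", f.read(12))
    shape = struct.unpack(f"{n_dims}i", f.read(4 * n_dims))[::-1]  # dims were written reversed
    name = f.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype_cur == 1 else np.float32
    n_bytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    data = np.frombuffer(f.read(n_bytes), dtype=dtype).reshape(shape)
    return name, data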

42 changes: 27 additions & 15 deletions optimum/exporters/ggml/base.py
@@ -14,7 +14,11 @@
# limitations under the License.
"""ggml configuration base classes."""

from abc import ABC
from abc import ABC, abstractmethod
from typing import Dict, Union

from numpy import ndarray
from torch import Tensor

from ..base import ExportConfig

@@ -24,24 +28,32 @@ class GgmlConfig(ExportConfig, ABC):
Base class for GGML exportable model.
"""

def __init__(self, config: "PretrainedConfig", task: str = "feature-extraction"):
STRUCT_HPARAM_KEYS = []
USE_BYTE_DECODER = True # TODO drop this flag once every config uses the byte decoder
GGML_MEM_ALIGN = 16

def __init__(self, config: "PretrainedConfig", task: str = "text-generation"):
self.task = task
self._config = config

@abstractmethod
def get_cpp_name(self, name: str) -> str:
raise NotImplementedError

def should_skip(self, name: str) -> bool:
return False

@abstractmethod
def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
raise NotImplementedError

@staticmethod
@abstractmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
return data, ftype
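These hooks are what a model-specific config has to supply. A minimal, hypothetical subclass sketch (the class name, hparam keys, and mappings are invented for illustration and are not part of the commit):

import numpy as np

class MyModelGgmlConfig(GgmlConfig):
    STRUCT_HPARAM_KEYS = ["n_positions", "hidden_size", "n_head", "n_layer"]

    def get_cpp_name(self, name: str) -> str:
        # Map a Hugging Face state_dict key to the name the C++ loader expects.
        return name.replace("transformer.", "")

    def reshape_weights(self, name, weights, hparams):
        # No layout changes for this toy model; just drop singleton dims.
        return weights.squeeze().numpy()

    @staticmethod
    def convert_dtype(name, data, ftype, n_dims):
        data = data.astype(np.float32)
        if ftype == 1 and n_dims > 1:
            return data.astype(np.float16), 1
        return data, 0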


class GgmlConfigWithPast(GgmlConfig, ABC):
@classmethod
def with_past(cls, config: "PretrainedConfig", task: str = "feature-extraction") -> "OnnxConfigWithPast":
"""
Instantiates a [`~optimum.exporters.onnx.OnnxConfig`] with `use_past` attribute set to `True`.
Args:
config (`transformers.PretrainedConfig`):
The underlying model's config to use when exporting to ONNX.
task (`str`, defaults to `"feature-extraction"`):
The task the model should be exported for.
Returns:
[`~optimum.exporters.onnx.GgmlConfig`]: The ggml config with `.use_past = True`
"""
def with_past(cls, config: "PretrainedConfig", task: str = "text-generation") -> "GgmlConfigWithPast":
return cls(config, task=task, use_past=True)
196 changes: 184 additions & 12 deletions optimum/exporters/ggml/model_configs.py
@@ -17,6 +17,14 @@
configurations.
"""

import re
from typing import Dict, Union

import numpy as np
import torch
from numpy import ndarray
from torch import Tensor

from ...utils import DummyTextInputGenerator, logging
from .base import GgmlConfigWithPast

@@ -35,15 +43,179 @@ class TextDecoderGGMLConfig(GgmlConfigWithPast):

# Original code: https://github.com/NouamaneTazi/bloomz.cpp/blob/main/convert-hf-to-ggml.py
class BloomGgmlConfig(TextDecoderGGMLConfig):
CONV_MAP = {
"word_embeddings": "tok_embeddings",
"word_embeddings_layernorm": "norm",
"input_layernorm": "attention_norm",
"self_attention.query_key_value": "attention.query_key_value",
"self_attention.dense": "attention.wo",
"post_attention_layernorm": "ffn_norm",
"mlp.dense_h_to_4h": "feed_forward.w1",
"mlp.dense_4h_to_h": "feed_forward.w2",
"ln_f": "output_norm",
"lm_head": "output",
}
STRUCT_HPARAM_KEYS = [
"n_positions",
"hidden_size",
"multiple_of",
"n_head",
"n_layer",
]
USE_BYTE_DECODER = False

def get_cpp_name(self, name: str) -> str:
conv_map = {
"word_embeddings": "tok_embeddings",
"word_embeddings_layernorm": "norm",
"input_layernorm": "attention_norm",
"self_attention.query_key_value": "attention.query_key_value",
"self_attention.dense": "attention.wo",
"post_attention_layernorm": "ffn_norm",
"mlp.dense_h_to_4h": "feed_forward.w1",
"mlp.dense_4h_to_h": "feed_forward.w2",
"ln_f": "output_norm",
"lm_head": "output",
}
if name != "lm_head.weight":
nn = name.split(".")[1:]
else:
nn = name.split(".")
if nn[0] == "h":
nn[0] = "layers"
mapped = conv_map[".".join(nn[2:-1])]
name = ".".join(nn[:2] + [mapped] + nn[-1:])
else:
mapped = conv_map[".".join(nn[:-1])]
name = ".".join([mapped] + nn[-1:])
return name
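For example, on a typical Bloom checkpoint this mapping behaves as follows (illustrative calls on a hypothetical cfg = BloomGgmlConfig(config) instance):

cfg.get_cpp_name("transformer.h.0.self_attention.dense.weight")  # -> "layers.0.attention.wo.weight"
cfg.get_cpp_name("transformer.word_embeddings.weight")           # -> "tok_embeddings.weight"
cfg.get_cpp_name("lm_head.weight")                               # -> "output.weight"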

def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
if "query_key_value" in name:
q, k, v = weights.reshape(hparams["n_head"], 3, -1).unbind(1)
return torch.cat([q, k, v], dim=0).reshape_as(weights)
return weights.squeeze().numpy()
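Bloom stores Q, K and V interleaved per attention head, so the fused matrix is regrouped into contiguous Q, K and V blocks before export. A toy check of the regrouping (hypothetical sizes: 2 heads, hidden size 2):

import torch

w = torch.arange(12).reshape(6, 2)       # rows laid out per head: [q0, k0, v0, q1, k1, v1]
q, k, v = w.reshape(2, 3, -1).unbind(1)  # n_head = 2
out = torch.cat([q, k, v], dim=0).reshape_as(w)
# rows of out are now ordered [q0, q1, k0, k1, v0, v1]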

@staticmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
# default type is fp32
if isinstance(data, Tensor):
data = data.numpy()
ftype_cur = 0
data = data.astype(np.float32)
if ftype == 1 and n_dims > 1:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
return data, ftype_cur


class GPTBigCodeGgmlConfig(TextDecoderGGMLConfig):
STRUCT_HPARAM_KEYS = [
"n_positions",
"n_embd",
"n_inner",
"n_head",
"n_layer",
]

def get_cpp_name(self, name: str) -> str:
if name == "transformer.ln_f.weight":
name = "model/ln_f/g"
elif name == "transformer.ln_f.bias":
name = "model/ln_f/b"
elif name == "transformer.wte.weight":
name = "model/wte"
elif name == "transformer.wpe.weight":
name = "model/wpe"
elif name == "lm_head.weight":
name = "model/lm_head"
elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_1/g"
elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_1/b"
elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_attn/w"
elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_attn/b"
elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_proj/w"
elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_proj/b"
elif re.match(r"transformer.h.\d+.ln_2.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_2/g"
elif re.match(r"transformer.h.\d+.ln_2.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_2/b"
elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_fc/w"
elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_fc/b"
elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_proj/w"
elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_proj/b"
else:
print("Unrecognized variable name. %s", name)
return name
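Concretely, this renames StarCoder/GPTBigCode keys like so (illustrative calls on a hypothetical cfg = GPTBigCodeGgmlConfig(config) instance):

cfg.get_cpp_name("transformer.h.0.attn.c_attn.weight")  # -> "model/h0/attn/c_attn/w"
cfg.get_cpp_name("transformer.h.11.mlp.c_proj.bias")    # -> "model/h11/mlp/c_proj/b"
cfg.get_cpp_name("transformer.ln_f.bias")               # -> "model/ln_f/b"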

"model/h.*/attn/c_attn/w"
"model/h.*/attn/c_proj/w"
"model/h.*/mlp/c_fc/w"
"model/h.*/mlp/c_proj/w"

def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
weights = weights.squeeze().numpy()
name_suffixes = {
"/attn/c_attn/w",
"/attn/c_attn/weight",
"/attn/c_attn/b",
"/attn/c_attn/bias",
}

if any(name.endswith(suffix) for suffix in name_suffixes):
print(" Duplicate K,V heads to use MHA instead of MQA")

embed_dim = hparams["n_embd"]
head_dim = embed_dim // hparams["n_head"]

# ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
q, k, v = np.split(weights, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
# duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
if len(k.shape) == 2:
k = np.tile(k, (hparams["n_head"], 1))
v = np.tile(v, (hparams["n_head"], 1))
elif len(k.shape) == 1:
k = np.tile(k, (hparams["n_head"]))
v = np.tile(v, (hparams["n_head"]))
# concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
weights = np.concatenate((q, k, v), axis=0)
return weights
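Shape-wise, the fused MQA projection has (n_embd + 2 * head_dim) rows and the duplication above expands it to the 3 * n_embd rows of a standard MHA layout; a toy check with made-up sizes:

import numpy as np

n_head, head_dim = 4, 2                          # toy sizes
n_embd = n_head * head_dim
w = np.zeros((n_embd + 2 * head_dim, n_embd))    # MQA: queries per head, one shared K and V
q, k, v = np.split(w, (n_head * head_dim, (n_head + 1) * head_dim), axis=0)
k, v = np.tile(k, (n_head, 1)), np.tile(v, (n_head, 1))
assert np.concatenate((q, k, v), axis=0).shape == (3 * n_embd, n_embd)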

def should_skip(self, name: str) -> bool:
return name.endswith("attn.masked_bias") or name.endswith(".attn.bias")

@staticmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
return data.astype(np.float32), 0  # TODO fix the fp16 option; force fp32 for now, so the code below is unreachable

if ftype == 1:
name_suffixes = {
"/g",
"/w",
"/weight",
}

if (
name == "model/wte"
or name == "model/lm_head"
or any(name.endswith(suffix) for suffix in name_suffixes)
) and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0
return data, ftype
