Commit c344cec: Add starcoder
bk-jc committed Aug 17, 2023
1 parent 8b254e9 commit c344cec
Showing 8 changed files with 1,224 additions and 152 deletions.
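The new StarCoder support plugs into the existing GGML export entry point, main_export in optimum/exporters/ggml/__main__.py. As a rough sketch of how an export might be driven from Python (the import path and keyword names below are inferred from this diff and should be treated as assumptions, not a verified API):

from optimum.exporters.ggml.__main__ import main_export  # import path assumed

# Hypothetical call: export a StarCoder checkpoint to a ggml .bin file.
main_export(model_name_or_path="bigcode/starcoder", output="./ggml-export", task="text-generation")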
93 changes: 46 additions & 47 deletions optimum/exporters/ggml/__main__.py
@@ -20,12 +20,11 @@
from pathlib import Path
from typing import Optional, Union

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer

from optimum.commands.export.ggml import parse_args_ggml
from optimum.exporters.ggml.utils import infer_task
from optimum.exporters.ggml.utils import bytes_to_unicode, infer_task
from optimum.exporters.tasks import TasksManager
from optimum.utils import logging

@@ -94,63 +93,62 @@ def main_export(
task=task,
)

conv_map = ggml_config.CONV_MAP

fname_out = os.path.join(output, f"ggml-model-{model_name_or_path.split('/')[-1]}-{ftype_str[ftype]}.bin")
fout = open(fname_out, "wb")

# Hardcoded for Bloom. TODO: remove this as a cpp-side argument and hardcode it there so this hparam can be dropped
hparams["multiple_of"] = 1

vocab_size = hparams["vocab_size"]

fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["multiple_of"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", vocab_size))
for key in ggml_config.STRUCT_HPARAM_KEYS:
fout.write(struct.pack("i", hparams[key]))
fout.write(struct.pack("i", ftype))

for i in range(hparams["vocab_size"]):
text = tokenizer.decode([i]).encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
if ggml_config.USE_BYTE_DECODER:
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
encoder = tokenizer.vocab

fout.write(struct.pack("i", vocab_size))

for key in sorted(encoder, key=encoder.get):
text = bytearray([byte_decoder[c] for c in key])
fout.write(struct.pack("i", len(text)))
fout.write(text)

else:
for i in range(vocab_size):
text = tokenizer.decode([i]).encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
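At this point the file holds the magic number, the vocab size, the config-specific struct hparams, the ftype flag, and the vocabulary. A minimal reader sketch for that layout (not part of the commit; it assumes five struct hparams, as in the configs added below, and an optional repeated vocab size for the byte-decoder path):

import struct

def read_ggml_header(path, n_hparams=5, vocab_size_repeated=True):
    # Sketch only: mirrors the writes above under the stated assumptions.
    with open(path, "rb") as f:
        magic, = struct.unpack("i", f.read(4))
        assert magic == 0x67676D6C  # "ggml"
        vocab_size, = struct.unpack("i", f.read(4))
        hparams = struct.unpack(f"{n_hparams}i", f.read(4 * n_hparams))
        ftype, = struct.unpack("i", f.read(4))
        if vocab_size_repeated:  # the byte-decoder branch writes the vocab size a second time
            vocab_size, = struct.unpack("i", f.read(4))
        vocab = [f.read(struct.unpack("i", f.read(4))[0]) for _ in range(vocab_size)]
    return hparams, ftype, vocab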

list_vars = model.state_dict()
for name in list_vars.keys():
src = name
nn = name
if name != "lm_head.weight":
nn = nn.split(".")[1:]
else:
nn = nn.split(".")

if nn[0] == "h":
nn[0] = "layers"
mapped = conv_map[".".join(nn[2:-1])]
name = ".".join(nn[:2] + [mapped] + nn[-1:])
else:
mapped = conv_map[".".join(nn[:-1])]
name = ".".join([mapped] + nn[-1:])

if "query_key_value" in src:
q, k, v = list_vars[src].reshape(config.n_head, 3, -1).unbind(1)
list_vars[src] = torch.cat([q, k, v], dim=0).reshape_as(list_vars[src])

print(src, " -> ", name)
data = list_vars[src].squeeze().numpy()
data = data.astype(np.float32)

n_dims = len(data.shape)
print(name, n_dims, data.shape)

# default type is fp32
ftype_cur = 0
if ftype == 1 and n_dims > 1:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
print("Processing variable: " + name)

if hasattr(ggml_config, "get_cpp_name"):
cpp_name = ggml_config.get_cpp_name(name=name)

if hasattr(ggml_config, "should_skip") and ggml_config.should_skip(name=cpp_name):
continue

if hasattr(ggml_config, "reshape_weights"):
list_vars[name] = ggml_config.reshape_weights(name=cpp_name, weights=list_vars[name], hparams=hparams)

n_dims = len(list_vars[name].shape)
data, ftype_cur = ggml_config.convert_dtype(name=cpp_name, data=list_vars[name], ftype=ftype, n_dims=n_dims)

if data.nbytes % ggml_config.GGML_MEM_ALIGN != 0:
description = f"Expected data (weights of {name}) to have a multiple of f{ggml_config.GGML_MEM_ALIGN} bytes, but data has {data.nbytes} bytes. Skipping to avoid memory alignment issues."
print(f" {description}")
logger.warning(description)
continue

# header
str = name.encode("utf-8")
str = cpp_name.encode("utf-8")
fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
@@ -161,6 +159,7 @@

fout.close()

print("Done. Output file: " + fname_out)
if return_source_model:
return model
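After the header and vocabulary, each tensor is emitted as a small record: the iii triple above, the reversed shape, and, in the lines elided from this hunk, presumably the encoded name followed by the raw data (as in the upstream bloomz.cpp converter this file is based on). A reader sketch under that assumption:

import struct

import numpy as np

def read_tensor_record(f):
    # Sketch only: assumes the name bytes and raw data directly follow the dims, as in bloomz.cpp.
    n_dims, name_len, ftype_cur = struct.unpack("iii", f.read(12))
    shape = struct.unpack(f"{n_dims}i", f.read(4 * n_dims))[::-1]  # dims were written reversed
    name = f.read(name_len).decode("utf-8")
    dtype = np.float16 if ftype_cur == 1 else np.float32
    n_bytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    data = np.frombuffer(f.read(n_bytes), dtype=dtype).reshape(shape)
    return name, data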

42 changes: 27 additions & 15 deletions optimum/exporters/ggml/base.py
@@ -14,7 +14,11 @@
# limitations under the License.
"""ggml configuration base classes."""

from abc import ABC
from abc import ABC, abstractmethod
from typing import Dict, Union

from numpy import ndarray
from torch import Tensor

from ..base import ExportConfig

@@ -24,24 +28,32 @@ class GgmlConfig(ExportConfig, ABC):
Base class for GGML exportable model.
"""

def __init__(self, config: "PretrainedConfig", task: str = "feature-extraction"):
STRUCT_HPARAM_KEYS = []
USE_BYTE_DECODER = True # TODO drop this flag once every config uses the byte decoder
GGML_MEM_ALIGN = 16

def __init__(self, config: "PretrainedConfig", task: str = "text-generation"):
self.task = task
self._config = config

@abstractmethod
def get_cpp_name(self, name: str) -> str:
raise NotImplementedError

def should_skip(self, name: str) -> bool:
return False

@abstractmethod
def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
raise NotImplementedError

@staticmethod
@abstractmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
return data, ftype
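These hooks are what a model-specific config has to supply. A minimal, hypothetical subclass sketch (the class name, hparam keys, and mappings are invented for illustration and are not part of the commit):

import numpy as np

class MyModelGgmlConfig(GgmlConfig):
    STRUCT_HPARAM_KEYS = ["n_positions", "hidden_size", "n_head", "n_layer"]

    def get_cpp_name(self, name: str) -> str:
        # Map a Hugging Face state_dict key to the name the C++ loader expects.
        return name.replace("transformer.", "")

    def reshape_weights(self, name, weights, hparams):
        # No layout changes for this toy model; just drop singleton dims.
        return weights.squeeze().numpy()

    @staticmethod
    def convert_dtype(name, data, ftype, n_dims):
        data = data.astype(np.float32)
        if ftype == 1 and n_dims > 1:
            return data.astype(np.float16), 1
        return data, 0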


class GgmlConfigWithPast(GgmlConfig, ABC):
@classmethod
def with_past(cls, config: "PretrainedConfig", task: str = "feature-extraction") -> "OnnxConfigWithPast":
"""
Instantiates a [`~optimum.exporters.onnx.OnnxConfig`] with `use_past` attribute set to `True`.
Args:
config (`transformers.PretrainedConfig`):
The underlying model's config to use when exporting to ONNX.
task (`str`, defaults to `"feature-extraction"`):
The task the model should be exported for.
Returns:
[`~optimum.exporters.onnx.GgmlConfig`]: The ggml config with `.use_past = True`
"""
def with_past(cls, config: "PretrainedConfig", task: str = "text-generation") -> "GgmlConfigWithPast":
return cls(config, task=task, use_past=True)
196 changes: 184 additions & 12 deletions optimum/exporters/ggml/model_configs.py
@@ -17,6 +17,14 @@
configurations.
"""

import re
from typing import Dict, Union

import numpy as np
import torch
from numpy import ndarray
from torch import Tensor

from ...utils import DummyTextInputGenerator, logging
from .base import GgmlConfigWithPast

@@ -35,15 +43,179 @@ class TextDecoderGGMLConfig(GgmlConfigWithPast):

# Original code: https://github.com/NouamaneTazi/bloomz.cpp/blob/main/convert-hf-to-ggml.py
class BloomGgmlConfig(TextDecoderGGMLConfig):
CONV_MAP = {
"word_embeddings": "tok_embeddings",
"word_embeddings_layernorm": "norm",
"input_layernorm": "attention_norm",
"self_attention.query_key_value": "attention.query_key_value",
"self_attention.dense": "attention.wo",
"post_attention_layernorm": "ffn_norm",
"mlp.dense_h_to_4h": "feed_forward.w1",
"mlp.dense_4h_to_h": "feed_forward.w2",
"ln_f": "output_norm",
"lm_head": "output",
}
STRUCT_HPARAM_KEYS = [
"n_positions",
"hidden_size",
"multiple_of",
"n_head",
"n_layer",
]
USE_BYTE_DECODER = False

def get_cpp_name(self, name: str) -> str:
conv_map = {
"word_embeddings": "tok_embeddings",
"word_embeddings_layernorm": "norm",
"input_layernorm": "attention_norm",
"self_attention.query_key_value": "attention.query_key_value",
"self_attention.dense": "attention.wo",
"post_attention_layernorm": "ffn_norm",
"mlp.dense_h_to_4h": "feed_forward.w1",
"mlp.dense_4h_to_h": "feed_forward.w2",
"ln_f": "output_norm",
"lm_head": "output",
}
if name != "lm_head.weight":
nn = name.split(".")[1:]
else:
nn = name.split(".")
if nn[0] == "h":
nn[0] = "layers"
mapped = conv_map[".".join(nn[2:-1])]
name = ".".join(nn[:2] + [mapped] + nn[-1:])
else:
mapped = conv_map[".".join(nn[:-1])]
name = ".".join([mapped] + nn[-1:])
return name
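For example, on a typical Bloom checkpoint this mapping behaves as follows (illustrative calls on a hypothetical cfg = BloomGgmlConfig(config) instance):

cfg.get_cpp_name("transformer.h.0.self_attention.dense.weight")  # -> "layers.0.attention.wo.weight"
cfg.get_cpp_name("transformer.word_embeddings.weight")           # -> "tok_embeddings.weight"
cfg.get_cpp_name("lm_head.weight")                               # -> "output.weight"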

def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
if "query_key_value" in name:
q, k, v = weights.reshape(hparams["n_head"], 3, -1).unbind(1)
return torch.cat([q, k, v], dim=0).reshape_as(weights)
return weights.squeeze().numpy()
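Bloom stores Q, K and V interleaved per attention head, so the fused matrix is regrouped into contiguous Q, K and V blocks before export. A toy check of the regrouping (hypothetical sizes: 2 heads, hidden size 2):

import torch

w = torch.arange(12).reshape(6, 2)       # rows laid out per head: [q0, k0, v0, q1, k1, v1]
q, k, v = w.reshape(2, 3, -1).unbind(1)  # n_head = 2
out = torch.cat([q, k, v], dim=0).reshape_as(w)
# rows of out are now ordered [q0, q1, k0, k1, v0, v1]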

@staticmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
# default type is fp32
if isinstance(data, Tensor):
data = data.numpy()
ftype_cur = 0
data = data.astype(np.float32)
if ftype == 1 and n_dims > 1:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
return data, ftype_cur


class GPTBigCodeGgmlConfig(TextDecoderGGMLConfig):
STRUCT_HPARAM_KEYS = [
"n_positions",
"n_embd",
"n_inner",
"n_head",
"n_layer",
]

def get_cpp_name(self, name: str) -> str:
if name == "transformer.ln_f.weight":
name = "model/ln_f/g"
elif name == "transformer.ln_f.bias":
name = "model/ln_f/b"
elif name == "transformer.wte.weight":
name = "model/wte"
elif name == "transformer.wpe.weight":
name = "model/wpe"
elif name == "lm_head.weight":
name = "model/lm_head"
elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_1/g"
elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_1/b"
elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_attn/w"
elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_attn/b"
elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_proj/w"
elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/attn/c_proj/b"
elif re.match(r"transformer.h.\d+.ln_2.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_2/g"
elif re.match(r"transformer.h.\d+.ln_2.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/ln_2/b"
elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_fc/w"
elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_fc/b"
elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_proj/w"
elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
i = re.findall("\d+", name)[0]
name = f"model/h{i}/mlp/c_proj/b"
else:
print("Unrecognized variable name. %s", name)
return name
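Concretely, this renames StarCoder/GPTBigCode keys like so (illustrative calls on a hypothetical cfg = GPTBigCodeGgmlConfig(config) instance):

cfg.get_cpp_name("transformer.h.0.attn.c_attn.weight")  # -> "model/h0/attn/c_attn/w"
cfg.get_cpp_name("transformer.h.11.mlp.c_proj.bias")    # -> "model/h11/mlp/c_proj/b"
cfg.get_cpp_name("transformer.ln_f.bias")               # -> "model/ln_f/b"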

"model/h.*/attn/c_attn/w"
"model/h.*/attn/c_proj/w"
"model/h.*/mlp/c_fc/w"
"model/h.*/mlp/c_proj/w"

def reshape_weights(self, name: str, weights: Union[ndarray, Tensor], hparams: Dict) -> ndarray:
weights = weights.squeeze().numpy()
name_suffixes = {
"/attn/c_attn/w",
"/attn/c_attn/weight",
"/attn/c_attn/b",
"/attn/c_attn/bias",
}

if any(name.endswith(suffix) for suffix in name_suffixes):
print(" Duplicate K,V heads to use MHA instead of MQA")

embed_dim = hparams["n_embd"]
head_dim = embed_dim // hparams["n_head"]

# ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
q, k, v = np.split(weights, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
# duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
if len(k.shape) == 2:
k = np.tile(k, (hparams["n_head"], 1))
v = np.tile(v, (hparams["n_head"], 1))
elif len(k.shape) == 1:
k = np.tile(k, (hparams["n_head"]))
v = np.tile(v, (hparams["n_head"]))
# concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
weights = np.concatenate((q, k, v), axis=0)
return weights
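Shape-wise, the fused MQA projection has (n_embd + 2 * head_dim) rows and the duplication above expands it to the 3 * n_embd rows of a standard MHA layout; a toy check with made-up sizes:

import numpy as np

n_head, head_dim = 4, 2                          # toy sizes
n_embd = n_head * head_dim
w = np.zeros((n_embd + 2 * head_dim, n_embd))    # MQA: queries per head, one shared K and V
q, k, v = np.split(w, (n_head * head_dim, (n_head + 1) * head_dim), axis=0)
k, v = np.tile(k, (n_head, 1)), np.tile(v, (n_head, 1))
assert np.concatenate((q, k, v), axis=0).shape == (3 * n_embd, n_embd)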

def should_skip(self, name: str) -> bool:
return name.endswith("attn.masked_bias") or name.endswith(".attn.bias")

@staticmethod
def convert_dtype(name: str, data: Union[ndarray, Tensor], ftype: int, n_dims: int) -> tuple[ndarray, int]:
return data.astype(np.float32), 0  # TODO fix the fp16 option; force fp32 for now, so the code below is unreachable

if ftype == 1:
name_suffixes = {
"/g",
"/w",
"/weight",
}

if (
name == "model/wte"
or name == "model/lm_head"
or any(name.endswith(suffix) for suffix in name_suffixes)
) and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0
return data, ftype
