Ggml exporter #1293

Draft · wants to merge 8 commits into main
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "optimum/exporters/ggml/src/ggml.cpp"]
path = optimum/exporters/ggml/src/ggml.cpp
url = https://github.com/ggerganov/ggml
98 changes: 98 additions & 0 deletions optimum/commands/export/ggml.py
@@ -0,0 +1,98 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines the command line for the export with ggml."""

import subprocess
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Optional

from ...exporters import TasksManager
from ..base import BaseOptimumCLICommand


if TYPE_CHECKING:
    from argparse import ArgumentParser, Namespace, _SubParsersAction

    from ..base import CommandInfo


def parse_args_ggml(parser: "ArgumentParser"):
    required_group = parser.add_argument_group("Required arguments")
    required_group.add_argument(
        "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load the model from."
    )
    required_group.add_argument(
        "output", type=Path, help="Path indicating the directory where to store the generated ggml model."
    )

    optional_group = parser.add_argument_group("Optional arguments")
    optional_group.add_argument(
        "--task",
        default="auto",
        help=(
            "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
            f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
        ),
    )
    optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store the cache.")
    optional_group.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Allow using custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute arbitrary code present in the model repository on your local machine.",
    )

    input_group = parser.add_argument_group("Input shapes")
    doc_input = "that the ggml exported model will be able to take as input."
    input_group.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help=f"Batch size {doc_input}",
    )
    input_group.add_argument(
        "--sequence_length",
        type=int,
        default=None,
        help=f"Sequence length {doc_input}",
    )
    input_group.add_argument(
        "--num_choices",
        type=int,
        default=None,
        help=f"Only for the multiple-choice task. Number of choices {doc_input}",
    )
    # TODO: add quantization args


class GgmlExportCommand(BaseOptimumCLICommand):
    def __init__(
        self,
        subparsers: Optional["_SubParsersAction"],
        args: Optional["Namespace"] = None,
        command: Optional["CommandInfo"] = None,
        from_defaults_factory: bool = False,
        parser: Optional["ArgumentParser"] = None,
    ):
        super().__init__(subparsers, args, command=command, from_defaults_factory=from_defaults_factory, parser=parser)
        # TODO: hack until GgmlExportCommand no longer uses subprocess.
        self.args_string = " ".join(sys.argv[3:])

    @staticmethod
    def parse_args(parser: "ArgumentParser"):
        return parse_args_ggml(parser)

    def run(self):
        full_command = f"python3 -m optimum.exporters.ggml {self.args_string}"
        subprocess.run(full_command, shell=True, check=True)
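
For reference, the `run()` method above just shells out to `python3 -m optimum.exporters.ggml`, so the same export can be driven programmatically. A minimal sketch, assuming `main_export` from optimum/exporters/ggml/__main__.py later in this diff; the model ID is a hypothetical example (the converter currently hardcodes a Bloom-specific hyperparameter):

# Programmatic equivalent of: python3 -m optimum.exporters.ggml -m bigscience/bloom-560m ggml_output
from optimum.exporters.ggml.__main__ import main_export

main_export(
    model_name_or_path="bigscience/bloom-560m",  # hypothetical example model ID
    output="ggml_output",  # directory where the ggml .bin file is written
    task="auto",  # let the exporter infer the task from the model
)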
7 changes: 7 additions & 0 deletions optimum/exporters/ggml/.gitignore
@@ -0,0 +1,7 @@
*.o
*.a

models/*

/main
/quantize
36 changes: 36 additions & 0 deletions optimum/exporters/ggml/__init__.py
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from transformers.utils import _LazyModule


_import_structure = {
    "base": [],
    "convert": [],
}

if TYPE_CHECKING:
    from .base import QuantizationApproach, TFLiteQuantizationConfig, GgmlConfig  # noqa
else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
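
The empty lists in _import_structure register base and convert as lazily importable submodules without re-exporting any of their names. A minimal sketch of the resulting behavior, assuming both submodules exist as declared:

import optimum.exporters.ggml as ggml_exporters  # cheap: no submodule imported yet

# Attribute access on the lazy module triggers the real import on first use:
base = ggml_exporters.base  # loads optimum/exporters/ggml/base.py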
187 changes: 187 additions & 0 deletions optimum/exporters/ggml/__main__.py
@@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Entry point to the optimum.exporters.ggml command line."""

import os
import struct
from argparse import ArgumentParser
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union

import torch
from transformers import AutoConfig, AutoTokenizer

from optimum.commands.export.ggml import parse_args_ggml
from optimum.exporters.ggml.utils import bytes_to_unicode, infer_task
from optimum.exporters.tasks import TasksManager
from optimum.utils import logging


logger = logging.get_logger()
logger.setLevel(logging.INFO)

if TYPE_CHECKING:
    from transformers import PreTrainedModel, TFPreTrainedModel


def _get_submodels_and_ggml_configs(
    model: Union["PreTrainedModel", "TFPreTrainedModel"],
    task: str,
):
    ggml_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="ggml", task=task)
    ggml_config = ggml_config_constructor(model.config)

    return ggml_config


def main_export(
    model_name_or_path: str,
    output: Union[str, Path],
    task: str = "auto",
    fp16: Optional[bool] = False,
    cache_dir: Optional[str] = None,
    trust_remote_code: bool = False,
    return_source_model: bool = False,
) -> Union["PreTrainedModel", "TFPreTrainedModel", None]:
    """
    Full-suite ggml export.
    """

    output = Path(output)

    if not output.parent.exists():
        output.parent.mkdir(parents=True)

    # Infer the task
    task = infer_task(model_name_or_path, task)

    # make sure the output directory exists
    os.makedirs(output, exist_ok=True)

    # possible data types
    #    ftype == 0 -> float32
    #    ftype == 1 -> float16
    #
    # map from ftype to string
    ftype_str = ["f32", "f16"]
    ftype = int(fp16)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    config = AutoConfig.from_pretrained(model_name_or_path)
    hparams = config.to_dict()

    model = TasksManager.get_model_from_task(
        task,
        model_name_or_path,
        cache_dir=cache_dir,
        trust_remote_code=trust_remote_code,
        config=config,
        torch_dtype=torch.float16 if ftype == 1 else torch.float32,
    )

    ggml_config = _get_submodels_and_ggml_configs(
        model=model,
        task=task,
    )

    fname_out = os.path.join(output, f"ggml-model-{model_name_or_path.split('/')[-1]}-{ftype_str[ftype]}.bin")
    fout = open(fname_out, "wb")

    # Hardcoded for Bloom. TODO: remove it as an argument on the C++ side and
    # hardcode it there, so that this hparam can be dropped.
    hparams["multiple_of"] = 1

    vocab_size = hparams["vocab_size"]

    fout.write(struct.pack("i", 0x67676D6C))  # magic: "ggml" in hex
    fout.write(struct.pack("i", vocab_size))
    for key in ggml_config.STRUCT_HPARAM_KEYS:
        fout.write(struct.pack("i", hparams[key]))
    fout.write(struct.pack("i", ftype))

    if ggml_config.USE_BYTE_DECODER:
        byte_encoder = bytes_to_unicode()
        byte_decoder = {v: k for k, v in byte_encoder.items()}
        encoder = tokenizer.vocab

        fout.write(struct.pack("i", vocab_size))

        for key in sorted(encoder, key=encoder.get):
            text = bytearray([byte_decoder[c] for c in key])
            fout.write(struct.pack("i", len(text)))
            fout.write(text)

    else:
        for i in range(vocab_size):
            text = tokenizer.decode([i]).encode("utf-8")
            fout.write(struct.pack("i", len(text)))
            fout.write(text)

    list_vars = model.state_dict()
    for name in list_vars.keys():
        print("Processing variable: " + name)

        if hasattr(ggml_config, "get_cpp_name"):
            cpp_name = ggml_config.get_cpp_name(name=name)
        else:
            cpp_name = name

        if hasattr(ggml_config, "should_skip") and ggml_config.should_skip(name=cpp_name):
            continue

        if hasattr(ggml_config, "reshape_weights"):
            list_vars[name] = ggml_config.reshape_weights(name=cpp_name, weights=list_vars[name], hparams=hparams)

        n_dims = len(list_vars[name].shape)
        data, ftype_cur = ggml_config.convert_dtype(name=cpp_name, data=list_vars[name], ftype=ftype, n_dims=n_dims)

        if data.nbytes % ggml_config.GGML_MEM_ALIGN != 0:
            description = f"Expected data (weights of {name}) to have a multiple of {ggml_config.GGML_MEM_ALIGN} bytes, but data has {data.nbytes} bytes. Skipping to avoid memory alignment issues."
            print(f"  {description}")
            logger.warning(description)
            continue

        # header: number of dimensions, length of the encoded name, dtype,
        # then the shape (in reverse order) followed by the name itself
        encoded_name = cpp_name.encode("utf-8")
        fout.write(struct.pack("iii", n_dims, len(encoded_name), ftype_cur))
        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
        fout.write(encoded_name)

        # data
        data.tofile(fout)

    fout.close()

    print("Done. Output file: " + fname_out)
    if return_source_model:
        return model


def main():
    parser = ArgumentParser("Hugging Face Optimum ggml exporter")

    parse_args_ggml(parser)

    # Retrieve CLI arguments
    args = parser.parse_args()

    main_export(
        model_name_or_path=args.model,
        output=args.output,
        cache_dir=args.cache_dir,
        task=args.task,
        trust_remote_code=args.trust_remote_code,
    )


if __name__ == "__main__":
    main()
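
Taken together, the writes above give the output file a flat layout: a 4-byte magic, the vocab size, one 32-bit int per STRUCT_HPARAM_KEYS entry, the ftype, then the vocabulary and tensor records. A minimal sketch of reading the header back, assuming the caller knows how many hyperparameter ints the model's ggml config wrote:

import struct

def read_ggml_header(path: str, n_hparams: int):
    """Read back the header written by main_export above.

    n_hparams must match len(STRUCT_HPARAM_KEYS) of the model's ggml config.
    """
    with open(path, "rb") as f:
        (magic,) = struct.unpack("i", f.read(4))
        assert magic == 0x67676D6C  # "ggml" in hex, as written above
        (vocab_size,) = struct.unpack("i", f.read(4))
        hparams = struct.unpack(f"{n_hparams}i", f.read(4 * n_hparams))
        (ftype,) = struct.unpack("i", f.read(4))  # 0 -> f32, 1 -> f16
    return vocab_size, hparams, ftype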