Added ability to load local models, added early stopping, removed vocab check, fixed GPTJ model conversion #38

Open · wants to merge 6 commits into base: master
10 changes: 3 additions & 7 deletions README.md
@@ -15,7 +15,7 @@ This project aims to address the third using LLaMa.cpp and GGML.

- Inference Speed! Focus on inference, not training.
- Precompressed models.
- Minimal setup required - soon `pip install cformers` should be good to get started.
- Minimal setup required - `pip install cformers` should be good to get started.
- Easily switch between models and quantization types.
- Support a variety of prompts.

@@ -26,14 +26,12 @@ And most importantly:

Setup
```bash
pip install transformers wget
git clone https://github.com/nolanoOrg/cformers.git
cd cformers/cformers/cpp && make && cd ..
pip install cformers
```

Usage:
```python
from interface import AutoInference as AI
from cformers import AutoInference as AI
ai = AI('EleutherAI/gpt-j-6B')
x = ai.generate('def parse_html(html_doc):', num_tokens_to_generate=500)
print(x['token_str'])
@@ -58,8 +56,6 @@ chat.py accepts the following parameters:
- ```-p Tell me a joke``` for a single prompt interaction
- ```-m pythia``` to load one of the available models (bloom, pythia or gptj)

We are working on adding support for `pip install cformers.`

Following Architectures are supported:
- GPT-J
- BLOOM
2 changes: 1 addition & 1 deletion cformers/__init__.py
@@ -1,2 +1,2 @@
"""Cformers: SoTA Transformer inference on your CPU."""
from .interface import AutoModel, AutoTokenizer
from .interface import AutoInference
16 changes: 11 additions & 5 deletions cformers/cpp/converters/convert_gptj_to_ggml.py
@@ -41,13 +41,19 @@ def bytes_to_unicode():
model_card = sys.argv[1]
fname_out = sys.argv[2] + "/ggml-gptj-6b-model.bin"

if "CONVERTER_CACHE_DIR" in os.environ:
dir_cache = os.environ["CONVERTER_CACHE_DIR"] + model_card.replace('/', '-.-')
if not os.path.exists(os.path.realpath(model_card)):

if "CONVERTER_CACHE_DIR" in os.environ:
dir_cache = os.environ["CONVERTER_CACHE_DIR"] + model_card.replace('/', '-.-')
else:
dir_cache = "~/.cformers_converters" + model_card.replace('/', '-.-')

if not os.path.exists(dir_cache):
os.makedirs(dir_cache)

else:
dir_cache = "~/.cformers_converters" + model_card.replace('/', '-.-')
dir_cache = os.path.realpath(model_card)

if not os.path.exists(dir_cache):
os.makedirs(dir_cache)

# Fetch vocab.json from https://huggingface.co/<model_card>/resolve/main/vocab.json if not found in dir_cache/vocab.json
if not os.path.exists(dir_cache + "/vocab.json"):
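Read as a whole, the interleaved hunk above amounts to the following selection logic: if the `model_card` argument resolves to an existing local path it is used directly as the working directory, otherwise the previous cache-directory behaviour applies. A reconstructed sketch (indentation and surrounding lines are inferred from the diff, not copied verbatim from the file):

```python
import os
import sys

model_card = sys.argv[1]

if not os.path.exists(os.path.realpath(model_card)):
    # model_card is a Hugging Face model id: pick (or create) a converter cache directory
    if "CONVERTER_CACHE_DIR" in os.environ:
        dir_cache = os.environ["CONVERTER_CACHE_DIR"] + model_card.replace('/', '-.-')
    else:
        dir_cache = "~/.cformers_converters" + model_card.replace('/', '-.-')

    if not os.path.exists(dir_cache):
        os.makedirs(dir_cache)
else:
    # model_card points at a local checkout: use it directly, nothing to download
    dir_cache = os.path.realpath(model_card)
```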
10 changes: 5 additions & 5 deletions cformers/cpp/main.cpp
@@ -1058,11 +1058,11 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
int32_t n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));

if (n_vocab != model.hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
return false;
}
// if (n_vocab != model.hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
// }

std::string word;
for (int i = 0; i < n_vocab; i++) {
10 changes: 5 additions & 5 deletions cformers/cpp/quantize_gptj.cpp
@@ -103,11 +103,11 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read ((char *) &n_vocab, sizeof(n_vocab));
fout.write((char *) &n_vocab, sizeof(n_vocab));

if (n_vocab != hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
return false;
}
// if (n_vocab != hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
// return false;
// }

std::string word;
for (int i = 0; i < n_vocab; i++) {
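Both hunks above relax the same guard in the GPT-J loader and quantizer: the vocabulary count stored in the GGML file no longer has to equal `hparams.n_vocab`. A plausible motivation, which is my reading of the change rather than something stated in the diff, is that the converter writes the vocabulary from `vocab.json` while GPT-J's config reports a padded vocabulary size, so the two numbers legitimately differ. A hedged way to see that mismatch (numbers in the comments are typical values, not guarantees):

```python
import requests

# config.json declares the padded embedding size; vocab.json holds the actual BPE entries.
base = "https://huggingface.co/EleutherAI/gpt-j-6B/resolve/main"
config_vocab = requests.get(f"{base}/config.json").json()["vocab_size"]
vocab_entries = len(requests.get(f"{base}/vocab.json").json())

print(config_vocab)   # typically 50400
print(vocab_entries)  # typically 50257
```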
37 changes: 29 additions & 8 deletions cformers/interface.py
@@ -1,12 +1,15 @@
"""Call's the C++ code from Python."""
from subprocess import Popen, PIPE
import subprocess
import hashlib
import re
import os
import sys
import select
import wget
import requests
import pathlib
import time

import transformers as tf # RIP TensorFlow

@@ -144,17 +147,21 @@ def get_modes(self):

class AutoInference:
"""A wrapper for the C++ model."""
def __init__(self, model_name, hash_sum="", mode="int4_fixed_zero"):
def __init__(self, model_name, hash_sum="", mode="int4_fixed_zero",from_pretrained=""):
self.model_name = model_name
self.mode = mode
self.hash_sum = hash_sum
self.cpp_model_name = MAP_MODEL_TO_URL[model_name].cpp_model_name
self.model_url = MAP_MODEL_TO_URL[model_name].get_url(mode)
self.model_save_path = os.path.join(CFORMERS_CACHE_PATH, "models", model_name, mode)
self.tokenizer = tf.AutoTokenizer.from_pretrained(model_name)
if from_pretrained != "":
self.model_save_path = os.path.realpath(from_pretrained)
self.tokenizer = tf.AutoTokenizer.from_pretrained(os.path.dirname(from_pretrained))
else:
self.model_url = MAP_MODEL_TO_URL[model_name].get_url(mode)
self.model_save_path = os.path.join(CFORMERS_CACHE_PATH, "models", model_name, mode)
self.tokenizer = tf.AutoTokenizer.from_pretrained(model_name)

# Download the model if it doesn't exist
if not os.path.exists(self.model_save_path):
if not os.path.exists(self.model_save_path) and from_pretrained == "":
# Create the directory if it doesn't exist
parent_dir = os.path.dirname(self.model_save_path)
if not os.path.exists(parent_dir):
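With this change the constructor has two paths: the original hub download keyed by `model_name` and `mode`, and a local path taken whenever `from_pretrained` is non-empty, in which case the GGML file is used in place and the tokenizer is loaded from the file's parent directory. A hedged usage sketch (the file path below is a placeholder; `model_name` still has to be one of the supported model cards because `MAP_MODEL_TO_URL` is consulted either way):

```python
from cformers import AutoInference

# Placeholder path: any directory holding a converted GGML model file plus the
# matching tokenizer files should satisfy this branch of __init__.
ai = AutoInference(
    "EleutherAI/gpt-j-6B",
    from_pretrained="/models/gptj/ggml-gptj-6b-model.bin",
)
```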
@@ -183,7 +190,9 @@ def generate(self,
seed=42,
streaming_token_str_hook=lambda x: x,
streaming_token_ids_hook=lambda x: x,
print_streaming_output=True):
print_streaming_output=True,
end_token=None,
wait_for_process=False):
"""Generates text from the given prompt.

streaming_output_hook: function to be called after every token is generated.
@@ -197,11 +206,13 @@
f"Prompt should be a list of integers {prompt}"
# Convert to a string of space separated integers
prompt = " ".join([str(x) for x in prompt])

main_file = str(pathlib.Path(__file__).parent.resolve())

if os.name == 'nt':
main_file = "./cpp/main.exe"
main_file += "/cpp/main.exe"
else:
main_file = "./cpp/main"
main_file += "/cpp/main"

command = [main_file, self.cpp_model_name,
"-m", self.model_save_path,
@@ -237,6 +248,9 @@
streaming_token_str_hook(token_str)
streaming_token_ids_hook(token_id)
to_print = token_str
if token_str == end_token:
all_stdout_so_far += "<END|>"
break
else:
token_id_buffer += c.decode('utf-8')

@@ -263,12 +277,19 @@
# return all_stdout_so_far
token_line = re.findall(r'<\|BEGIN\>(.*?)<END\|>', all_stdout_so_far, re.DOTALL)[0]

print(token_line)

# Convert the token_line to a list of integers
all_tokens = [int(x) for x in token_line.split()]

# Decode the tokens
decoded_tokens = self.tokenizer.decode(all_tokens)

if not wait_for_process:
return {"success": True,
"token_ids": all_tokens,
"token_str": decoded_tokens}

# Get the exit code
success = process.wait()
# Kill the child process if it's still running
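The remaining hunks add early stopping and an optional early return to `generate()`: streaming breaks as soon as a decoded token equals `end_token`, and when `wait_for_process` is false the call returns the decoded result without waiting for the child process to exit. A hedged sketch of how the two combine (the prompt and stop string are illustrative, not values taken from this pull request):

```python
# Continuing from the `ai` object constructed above.
out = ai.generate(
    "def parse_html(html_doc):",
    num_tokens_to_generate=200,
    end_token="\n\n",        # break out of streaming when a token decodes to exactly this string
    wait_for_process=False,  # return token_ids/token_str without waiting on the subprocess
)
print(out["token_str"])
```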
40 changes: 40 additions & 0 deletions setup.py
@@ -0,0 +1,40 @@
from setuptools import setup, find_packages
import codecs
import os
import subprocess

packages= ['cformers', 'cformers/cpp']
package_data = {'cformers': ['*'], 'cformers/cpp': ['*']}
build_main = subprocess.run(["make"], stdout=subprocess.PIPE, cwd="cformers/cpp")

here = os.path.abspath(os.path.dirname(__file__))

with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh:
long_description = "\n" + fh.read()

VERSION = '0.0.4'
DESCRIPTION = 'SoTA Transformers with C-backend for fast inference on your CPU.'
LONG_DESCRIPTION = 'We identify three pillars to enable fast inference of SoTA AI models on your CPU:\n1. Fast C/C++ LLM inference kernels for CPU.\n2. Machine Learning Research & Exploration front - Compression through quantization, sparsification, training on more data, collecting data and training instruction & chat models.\n3. Easy to use API for fast AI inference in a dynamically typed language like Python.\n\nThis project aims to address the third using LLaMa.cpp and GGML.'

# Setting up
setup(
name="cformers",
version=VERSION,
author="Ayush Kaushal (Ayushk4)",
author_email="[email protected]",
description=DESCRIPTION,
long_description_content_type="text/markdown",
long_description=LONG_DESCRIPTION,
packages=packages,
package_data=package_data,
install_requires=['transformers', 'torch', 'wget'],
keywords=['python', 'local inference', 'c++ inference', 'language models', 'cpu inference', 'quantization'],
classifiers=[
"Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Developers",
"Programming Language :: Python :: 3",
"Operating System :: Unix",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
]
)
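Because setup.py shells out to `make` in `cformers/cpp` at build time, installing from a source checkout is expected to produce the native binary alongside the Python package. The same build step can be run by hand from the repository root; a minimal sketch (the `check=True` is added here for clarity and is not part of the PR's setup.py):

```python
import subprocess

# Equivalent of the build step setup.py runs; fails loudly if make fails.
subprocess.run(["make"], cwd="cformers/cpp", check=True)
```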