From d239353836c6de84a5d196988e48d2969e43af0f Mon Sep 17 00:00:00 2001 From: Umberto Griffo <1609440+umbertogriffo@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:26:02 +0100 Subject: [PATCH] refactor: get rid of ctransformers --- .github/workflows/ci.yaml | 5 - Makefile | 9 +- README.md | 25 +-- .../explore_ctransformers.py | 26 --- experiments/exp_ctransformers/model.py | 195 ------------------ experiments/exp_ctransformers/prompts.py | 33 --- todo.md | 1 - version/ctransformers | 1 - 8 files changed, 12 insertions(+), 283 deletions(-) delete mode 100644 experiments/exp_ctransformers/explore_ctransformers.py delete mode 100644 experiments/exp_ctransformers/model.py delete mode 100644 experiments/exp_ctransformers/prompts.py delete mode 100644 version/ctransformers diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8904dae..f2fabcd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,17 +40,12 @@ jobs: id: llama-cpp-version run: echo "llama-cpp-version=$(cat version/llama_cpp)" >> "$GITHUB_OUTPUT" - - name: Get ctransformers version - id: ctransformers-version - run: echo "ctransformers-version=$(cat version/ctransformers)" >> "$GITHUB_OUTPUT" - # Installing dependencies and llama-cpp-python without NVIDIA CUDA acceleration. - name: Setup environment run: | poetry lock --check poetry install --no-root --no-ansi . .venv/bin/activate && pip3 install llama-cpp-python==${{ steps.llama-cpp-version.outputs.llama-cpp-version }} - . .venv/bin/activate && pip3 install ctransformers==${{ steps.ctransformers-version.outputs.ctransformers-version }} - name: Run tests run: | diff --git a/Makefile b/Makefile index b3863a1..efc0dce 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,6 @@ llama_cpp_file=version/llama_cpp llama_cpp_version=`cat $(llama_cpp_file)` -ctransformers_file=version/ctransformers -ctransformers_version=`cat $(ctransformers_file)` - check: which pip3 which python3 @@ -15,18 +12,16 @@ install_cuda: mkdir -p .venv poetry config virtualenvs.in-project true poetry install --extras "cuda-acceleration" --no-root --no-ansi - echo "Installing llama-cpp-python and ctransformers with pip to get NVIDIA CUDA acceleration" + echo "Installing llama-cpp-python with pip to get NVIDIA CUDA acceleration" . .venv/bin/activate && CMAKE_ARGS="-DGGML_CUDA=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v - . .venv/bin/activate && pip3 install ctransformers[cuda]==$(ctransformers_version) install_metal: echo "Installing..." mkdir -p .venv poetry config virtualenvs.in-project true poetry install --no-root --no-ansi - echo "Installing llama-cpp-python and ctransformers with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)" + echo "Installing llama-cpp-python with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)" . .venv/bin/activate && CMAKE_ARGS="-DGGML_METAL=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v - . .venv/bin/activate && CT_METAL=1 pip install ctransformers==$(ctransformers_version) --no-binary ctransformers install_pre_commit: poetry run pre-commit install diff --git a/README.md b/README.md index bccf308..c68bdf6 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,8 @@ > * `MacOS Sonoma 14.3.1` running on a MacBook Pro M1 (2020). 
> > If you are using another Operating System or different hardware, and you can't load the models, please -> take a look either at the official Llama Cpp Python's +> take a look at the official Llama Cpp Python's > GitHub [issue](https://github.com/abetlen/llama-cpp-python/issues). -> or at the official CTransformers's GitHub [issue](https://github.com/marella/ctransformers/issues) > [!WARNING] > Note: it's important to note that the large language model sometimes generates hallucinations or false information. @@ -39,10 +38,8 @@ ## Introduction This project combines the power -of [Lama.cpp](https://github.com/abetlen/llama-cpp-python), [CTransformers](https://github.com/marella/ctransformers), -[LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and -querying the Vector Database, and we plan to eliminate it entirely), -[Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build: +of [Lama.cpp](https://github.com/abetlen/llama-cpp-python), [LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and querying the Vector Database, and we plan to +eliminate it entirely), [Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build: * a Conversation-aware Chatbot (ChatGPT like experience). * a RAG (Retrieval-augmented generation) ChatBot. @@ -79,7 +76,7 @@ To deal with context overflows, we implemented three approaches: ## Prerequisites * Python 3.10+ -* GPU supporting CUDA 12.1, 12.2, 12.3, or 12.4 +* GPU supporting CUDA 12.1+ * Poetry 1.7.0 ### Install Poetry @@ -124,14 +121,13 @@ To easily install the dependencies we created a make file. ## Using the Open-Source Models Locally -We utilize two open-source libraries, [Lama.cpp](https://github.com/abetlen/llama-cpp-python) -and [CTransformers](https://github.com/marella/ctransformers), -which allow us to work efficiently with transformer-based models efficiently. -Running the LLMs architecture on a local PC is impossible due to the large (~7 billion) number of -parameters. These libraries enable us to run them either on a `CPU` or `GPU`. +We utilize the open-source library [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a binding for [llama-cpp](https://github.com/ggerganov/llama.cpp), +allowing us to utilize it within a Python environment. +`llama-cpp` serves as a C++ backend designed to work efficiently with transformer-based models. +Running the LLMs architecture on a local PC is impossible due to the large (~7 billion) number of parameters. +This library enable us to run them either on a `CPU` or `GPU`. Additionally, we use the `Quantization and 4-bit precision` to reduce number of bits required to represent the numbers. -The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c) -format. +The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c) format. 
### Supported Models @@ -244,7 +240,6 @@ streamlit run chatbot/rag_chatbot_app.py -- --model openchat-3.6 --k 2 --synthes * Open Source Repositories: * [llama.cpp](https://github.com/ggerganov/llama.cpp) * [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - * [CTransformers](https://github.com/marella/ctransformers) * [GPT4All](https://github.com/nomic-ai/gpt4all) * [pyllamacpp](https://github.com/abdeladim-s/pyllamacpp) * [chroma](https://github.com/chroma-core/chroma) diff --git a/experiments/exp_ctransformers/explore_ctransformers.py b/experiments/exp_ctransformers/explore_ctransformers.py deleted file mode 100644 index 128209c..0000000 --- a/experiments/exp_ctransformers/explore_ctransformers.py +++ /dev/null @@ -1,26 +0,0 @@ -import time -from pathlib import Path - -from exp_ctransformers.model import Model, get_model_setting - -if __name__ == "__main__": - root_folder = Path(__file__).resolve().parent.parent.parent - model_folder = root_folder / "models" - Path(model_folder).parent.mkdir(parents=True, exist_ok=True) - - model_settings = get_model_setting("zephyr") - - llm = Model(model_folder, model_settings) - - # question_p = """What is the date for announcement""" - # context_p = """ On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt - # renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore.""" - - question_p = """Create a regex to extract dates from logs in Python""" - - prompt = llm.generate_prompt(question=question_p) - - start_time = time.time() - _ = llm.generate_output(prompt, max_new_tokens=1000) - took = time.time() - start_time - print(f"--- Took {took:.2f} seconds ---") diff --git a/experiments/exp_ctransformers/model.py b/experiments/exp_ctransformers/model.py deleted file mode 100644 index 73d321d..0000000 --- a/experiments/exp_ctransformers/model.py +++ /dev/null @@ -1,195 +0,0 @@ -import os -from abc import ABC -from pathlib import Path - -import requests -from ctransformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Config -from exp_ctransformers.prompts import generate_prompt -from tqdm import tqdm -from transformers import TextStreamer - - -class ModelSettings(ABC): - """ - top_k="The top-k value to use for sampling." - top_p="The top-p value to use for sampling." - temperature="The temperature to use for sampling." - repetition_penalty="The repetition penalty to use for sampling." - last_n_tokens="The number of last tokens to use for repetition penalty." - seed="The seed value to use for sampling tokens." - max_new_tokens="The maximum number of new tokens to generate." - stop="A list of sequences to stop generation when encountered." - stream="Whether to stream the generated text." - reset="Whether to reset the model state before generating text." - batch_size="The batch size to use for evaluating tokens in a single prompt." - threads="The number of threads to use for evaluating tokens." - context_length="The maximum context length to use." - gpu_layers="The number of layers to run on GPU." - Set gpu_layers to the number of layers to offload to GPU. - Set to 0 if no GPU acceleration is available on your system. 
- """ - - url: str - file_name: str - model_type: str - system_template: str - prompt_template: str - config: Config - - -class ZephyrSettings(ModelSettings): - url = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_K_M.gguf" - file_name = "zephyr-7b-beta.Q4_K_M.gguf" - model_type = "mistral" - config = Config( - top_k=40, - top_p=0.95, - temperature=0.8, - repetition_penalty=1.1, - last_n_tokens=64, - seed=-1, - batch_size=512, - threads=-1, - max_new_tokens=1024, - stop=None, - stream=False, - reset=True, - context_length=3048, - gpu_layers=50, - mmap=True, - mlock=False, - ) - system_template = "You are a helpful, respectful and honest assistant. " - prompt_template = """<|system|> {system} -Answer the question below: - -<|user|> -{question} -<|assistant|> -""" - - -class MistralSettings(ModelSettings): - url = "https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q4_K_M.gguf" - file_name = "mistral-7b-openorca.Q4_K_M.gguf" - model_type = "mistral" - config = Config( - top_k=40, - top_p=0.95, - temperature=0.8, - repetition_penalty=1.1, - last_n_tokens=64, - seed=-1, - batch_size=8, - threads=-1, - max_new_tokens=1024, - stop=None, - stream=False, - reset=True, - context_length=2048, - gpu_layers=50, - mmap=True, - mlock=False, - ) - system_template = "You are a helpful, respectful and honest assistant." - prompt_template = """<|im_start|>system -{system} -<|im_end|> -<|im_start|>user -{question}<|im_end|> -<|im_start|>assistant -<|im_start|>system -""" - - -SUPPORTED_MODELS = {"zephyr": ZephyrSettings, "mistral": MistralSettings} - - -def get_models(): - return list(SUPPORTED_MODELS.keys()) - - -def get_model_setting(model_name: str): - model_settings = SUPPORTED_MODELS.get(model_name) - - # validate input - if model_settings is None: - raise KeyError(model_name + " is a not supported model") - - return model_settings - - -class Model: - """ - This Model class encapsulates the initialization of the language model and tokenizer, as well as the generation of - prompts and outputs. - You can create an instance of this class and use its methods to handle the specific tasks you need. - """ - - def __init__(self, model_folder: Path, model_settings: ModelSettings): - self.model_settings = model_settings - self.model_path = model_folder / self.model_settings.file_name - self.prompt_template = self.model_settings.prompt_template - self.system_template = self.model_settings.system_template - - self._auto_download() - - self.llm = AutoModelForCausalLM.from_pretrained( - model_path_or_repo_id=str(model_folder), - model_file=self.model_settings.file_name, - model_type=self.model_settings.model_type, - config=AutoConfig(config=self.model_settings.config), - hf=True, - ) - self.tokenizer = AutoTokenizer.from_pretrained(self.llm) - - def _auto_download(self) -> None: - """ - Downloads a model file based on the provided name and saves it to the specified path. - - Returns: - None - - Raises: - Any exceptions raised during the download process will be caught and printed, but not re-raised. - - This function fetches model settings using the provided name, including the model's URL, and then downloads - the model file from the URL. The download is done in chunks, and a progress bar is displayed to visualize - the download process. - - """ - file_name = self.model_settings.file_name - url = self.model_settings.url - - if not os.path.exists(self.model_path): - # send a GET request to the URL to download the file. 
- # Stream it while downloading, since the file is large - - try: - response = requests.get(url, stream=True) - # open the file in binary mode and write the contents of the response - # in chunks. - with open(self.model_path, "wb") as f: - for chunk in tqdm(response.iter_content(chunk_size=8912)): - if chunk: - f.write(chunk) - - except Exception as e: - print(f"=> Download Failed. Error: {e}") - return - - print(f"=> Model: {file_name} downloaded successfully 🥳") - - def generate_prompt(self, question): - return generate_prompt( - template=self.prompt_template, - system=self.system_template, - question=question, - ) - - def generate_output(self, prompt: str, max_new_tokens: int = 1000): - inputs = self.tokenizer(text=prompt, return_tensors="pt").input_ids - streamer = TextStreamer(tokenizer=self.tokenizer, skip_prompt=True) - output = self.llm.generate(inputs, streamer=streamer, max_new_tokens=max_new_tokens) - - return output diff --git a/experiments/exp_ctransformers/prompts.py b/experiments/exp_ctransformers/prompts.py deleted file mode 100644 index f961fb4..0000000 --- a/experiments/exp_ctransformers/prompts.py +++ /dev/null @@ -1,33 +0,0 @@ -def generate_prompt(template: str, system: str, question: str): - """ - Generate a prompt by formatting a template with provided parameters. - - Args: - template (str): The template string with placeholders for 'system', 'context', and 'question'. - system (str): The system information to be inserted into the template. - question (str): The question to be inserted into the template. - - Returns: - str: The formatted prompt string. - """ - - prompt = template.format(system=system, question=question) - return prompt - - -def generate_contextual_prompt(template: str, system: str, question: str, context: str = ""): - """ - Generate a prompt by formatting a template with provided parameters. - - Args: - template (str): The template string with placeholders for 'system', 'context', and 'question'. - system (str): The system information to be inserted into the template. - question (str): The question to be inserted into the template. - context (str): The context information to be inserted into the template. - - Returns: - str: The formatted prompt string. - """ - - prompt = template.format(system=system, context=context, question=question) - return prompt diff --git a/todo.md b/todo.md index dfd4f92..099d6f0 100644 --- a/todo.md +++ b/todo.md @@ -1,5 +1,4 @@ # Todo -- Try Chat Templates https://medium.com/@ahmet_celebi/demystifying-chat-templates-of-llm-using-llama-cpp-and-ctransformers-f17871569cd6 - Test Flash attention: - https://github.com/ggerganov/llama.cpp/pull/5021 - Google Search with LLM: diff --git a/version/ctransformers b/version/ctransformers deleted file mode 100644 index b326984..0000000 --- a/version/ctransformers +++ /dev/null @@ -1 +0,0 @@ -0.2.27
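
Note (not part of the patch above): with the ctransformers experiment removed, the project relies solely on llama-cpp-python, as the updated README describes. The sketch below is a minimal, illustrative equivalent of what the deleted `Model` class did — load a quantized GGUF file and generate a completion — using llama-cpp-python's `Llama` API. The model path, prompt template, and sampling values are carried over from the deleted `ZephyrSettings` for illustration only; they are assumptions, not code introduced by this commit.

```python
# Minimal sketch, assuming llama-cpp-python is installed as per the Makefile targets.
# n_gpu_layers plays the role of the removed ctransformers gpu_layers setting
# (set it to 0 if no GPU acceleration is available).
from llama_cpp import Llama

llm = Llama(
    model_path="models/zephyr-7b-beta.Q4_K_M.gguf",  # quantized GGUF file, downloaded separately
    n_ctx=3048,        # context length (the deleted config used context_length=3048)
    n_gpu_layers=50,   # number of layers to offload to GPU
    seed=-1,
)

# Prompt formatted with the Zephyr template from the deleted settings.
prompt = """<|system|> You are a helpful, respectful and honest assistant.
Answer the question below:

<|user|>
Create a regex to extract dates from logs in Python
<|assistant|>
"""

output = llm(
    prompt,
    max_tokens=1000,
    temperature=0.8,
    top_k=40,
    top_p=0.95,
    repeat_penalty=1.1,
)
print(output["choices"][0]["text"])
```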