From d239353836c6de84a5d196988e48d2969e43af0f Mon Sep 17 00:00:00 2001 From: Umberto Griffo <1609440+umbertogriffo@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:26:02 +0100 Subject: [PATCH] refactor: get rid of ctransformers --- .github/workflows/ci.yaml | 5 - Makefile | 9 +- README.md | 25 +-- .../explore_ctransformers.py | 26 --- experiments/exp_ctransformers/model.py | 195 ------------------ experiments/exp_ctransformers/prompts.py | 33 --- todo.md | 1 - version/ctransformers | 1 - 8 files changed, 12 insertions(+), 283 deletions(-) delete mode 100644 experiments/exp_ctransformers/explore_ctransformers.py delete mode 100644 experiments/exp_ctransformers/model.py delete mode 100644 experiments/exp_ctransformers/prompts.py delete mode 100644 version/ctransformers diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8904dae..f2fabcd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,17 +40,12 @@ jobs: id: llama-cpp-version run: echo "llama-cpp-version=$(cat version/llama_cpp)" >> "$GITHUB_OUTPUT" - - name: Get ctransformers version - id: ctransformers-version - run: echo "ctransformers-version=$(cat version/ctransformers)" >> "$GITHUB_OUTPUT" - # Installing dependencies and llama-cpp-python without NVIDIA CUDA acceleration. - name: Setup environment run: | poetry lock --check poetry install --no-root --no-ansi . .venv/bin/activate && pip3 install llama-cpp-python==${{ steps.llama-cpp-version.outputs.llama-cpp-version }} - . .venv/bin/activate && pip3 install ctransformers==${{ steps.ctransformers-version.outputs.ctransformers-version }} - name: Run tests run: | diff --git a/Makefile b/Makefile index b3863a1..efc0dce 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,6 @@ llama_cpp_file=version/llama_cpp llama_cpp_version=`cat $(llama_cpp_file)` -ctransformers_file=version/ctransformers -ctransformers_version=`cat $(ctransformers_file)` - check: which pip3 which python3 @@ -15,18 +12,16 @@ install_cuda: mkdir -p .venv poetry config virtualenvs.in-project true poetry install --extras "cuda-acceleration" --no-root --no-ansi - echo "Installing llama-cpp-python and ctransformers with pip to get NVIDIA CUDA acceleration" + echo "Installing llama-cpp-python with pip to get NVIDIA CUDA acceleration" . .venv/bin/activate && CMAKE_ARGS="-DGGML_CUDA=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v - . .venv/bin/activate && pip3 install ctransformers[cuda]==$(ctransformers_version) install_metal: echo "Installing..." mkdir -p .venv poetry config virtualenvs.in-project true poetry install --no-root --no-ansi - echo "Installing llama-cpp-python and ctransformers with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)" + echo "Installing llama-cpp-python with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)" . .venv/bin/activate && CMAKE_ARGS="-DGGML_METAL=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v - . .venv/bin/activate && CT_METAL=1 pip install ctransformers==$(ctransformers_version) --no-binary ctransformers install_pre_commit: poetry run pre-commit install diff --git a/README.md b/README.md index bccf308..c68bdf6 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,8 @@ > * `MacOS Sonoma 14.3.1` running on a MacBook Pro M1 (2020). 
> > If you are using another Operating System or different hardware, and you can't load the models, please -> take a look either at the official Llama Cpp Python's +> take a look at the official Llama Cpp Python's > GitHub [issue](https://github.com/abetlen/llama-cpp-python/issues). -> or at the official CTransformers's GitHub [issue](https://github.com/marella/ctransformers/issues) > [!WARNING] > Note: it's important to note that the large language model sometimes generates hallucinations or false information. @@ -39,10 +38,8 @@ ## Introduction This project combines the power -of [Lama.cpp](https://github.com/abetlen/llama-cpp-python), [CTransformers](https://github.com/marella/ctransformers), -[LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and -querying the Vector Database, and we plan to eliminate it entirely), -[Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build: +of [Lama.cpp](https://github.com/abetlen/llama-cpp-python), [LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and querying the Vector Database, and we plan to +eliminate it entirely), [Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build: * a Conversation-aware Chatbot (ChatGPT like experience). * a RAG (Retrieval-augmented generation) ChatBot. @@ -79,7 +76,7 @@ To deal with context overflows, we implemented three approaches: ## Prerequisites * Python 3.10+ -* GPU supporting CUDA 12.1, 12.2, 12.3, or 12.4 +* GPU supporting CUDA 12.1+ * Poetry 1.7.0 ### Install Poetry @@ -124,14 +121,13 @@ To easily install the dependencies we created a make file. ## Using the Open-Source Models Locally -We utilize two open-source libraries, [Lama.cpp](https://github.com/abetlen/llama-cpp-python) -and [CTransformers](https://github.com/marella/ctransformers), -which allow us to work efficiently with transformer-based models efficiently. -Running the LLMs architecture on a local PC is impossible due to the large (~7 billion) number of -parameters. These libraries enable us to run them either on a `CPU` or `GPU`. +We utilize the open-source library [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a binding for [llama-cpp](https://github.com/ggerganov/llama.cpp), +allowing us to utilize it within a Python environment. +`llama-cpp` serves as a C++ backend designed to work efficiently with transformer-based models. +Running the LLMs architecture on a local PC is impossible due to the large (~7 billion) number of parameters. +This library enable us to run them either on a `CPU` or `GPU`. Additionally, we use the `Quantization and 4-bit precision` to reduce number of bits required to represent the numbers. -The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c) -format. +The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c) format. 
### Supported Models @@ -244,7 +240,6 @@ streamlit run chatbot/rag_chatbot_app.py -- --model openchat-3.6 --k 2 --synthes * Open Source Repositories: * [llama.cpp](https://github.com/ggerganov/llama.cpp) * [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - * [CTransformers](https://github.com/marella/ctransformers) * [GPT4All](https://github.com/nomic-ai/gpt4all) * [pyllamacpp](https://github.com/abdeladim-s/pyllamacpp) * [chroma](https://github.com/chroma-core/chroma) diff --git a/experiments/exp_ctransformers/explore_ctransformers.py b/experiments/exp_ctransformers/explore_ctransformers.py deleted file mode 100644 index 128209c..0000000 --- a/experiments/exp_ctransformers/explore_ctransformers.py +++ /dev/null @@ -1,26 +0,0 @@ -import time -from pathlib import Path - -from exp_ctransformers.model import Model, get_model_setting - -if __name__ == "__main__": - root_folder = Path(__file__).resolve().parent.parent.parent - model_folder = root_folder / "models" - Path(model_folder).parent.mkdir(parents=True, exist_ok=True) - - model_settings = get_model_setting("zephyr") - - llm = Model(model_folder, model_settings) - - # question_p = """What is the date for announcement""" - # context_p = """ On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt - # renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore.""" - - question_p = """Create a regex to extract dates from logs in Python""" - - prompt = llm.generate_prompt(question=question_p) - - start_time = time.time() - _ = llm.generate_output(prompt, max_new_tokens=1000) - took = time.time() - start_time - print(f"--- Took {took:.2f} seconds ---") diff --git a/experiments/exp_ctransformers/model.py b/experiments/exp_ctransformers/model.py deleted file mode 100644 index 73d321d..0000000 --- a/experiments/exp_ctransformers/model.py +++ /dev/null @@ -1,195 +0,0 @@ -import os -from abc import ABC -from pathlib import Path - -import requests -from ctransformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Config -from exp_ctransformers.prompts import generate_prompt -from tqdm import tqdm -from transformers import TextStreamer - - -class ModelSettings(ABC): - """ - top_k="The top-k value to use for sampling." - top_p="The top-p value to use for sampling." - temperature="The temperature to use for sampling." - repetition_penalty="The repetition penalty to use for sampling." - last_n_tokens="The number of last tokens to use for repetition penalty." - seed="The seed value to use for sampling tokens." - max_new_tokens="The maximum number of new tokens to generate." - stop="A list of sequences to stop generation when encountered." - stream="Whether to stream the generated text." - reset="Whether to reset the model state before generating text." - batch_size="The batch size to use for evaluating tokens in a single prompt." - threads="The number of threads to use for evaluating tokens." - context_length="The maximum context length to use." - gpu_layers="The number of layers to run on GPU." - Set gpu_layers to the number of layers to offload to GPU. - Set to 0 if no GPU acceleration is available on your system. 
- """ - - url: str - file_name: str - model_type: str - system_template: str - prompt_template: str - config: Config - - -class ZephyrSettings(ModelSettings): - url = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_K_M.gguf" - file_name = "zephyr-7b-beta.Q4_K_M.gguf" - model_type = "mistral" - config = Config( - top_k=40, - top_p=0.95, - temperature=0.8, - repetition_penalty=1.1, - last_n_tokens=64, - seed=-1, - batch_size=512, - threads=-1, - max_new_tokens=1024, - stop=None, - stream=False, - reset=True, - context_length=3048, - gpu_layers=50, - mmap=True, - mlock=False, - ) - system_template = "You are a helpful, respectful and honest assistant. " - prompt_template = """<|system|> {system} -Answer the question below: - -<|user|> -{question} -<|assistant|> -""" - - -class MistralSettings(ModelSettings): - url = "https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q4_K_M.gguf" - file_name = "mistral-7b-openorca.Q4_K_M.gguf" - model_type = "mistral" - config = Config( - top_k=40, - top_p=0.95, - temperature=0.8, - repetition_penalty=1.1, - last_n_tokens=64, - seed=-1, - batch_size=8, - threads=-1, - max_new_tokens=1024, - stop=None, - stream=False, - reset=True, - context_length=2048, - gpu_layers=50, - mmap=True, - mlock=False, - ) - system_template = "You are a helpful, respectful and honest assistant." - prompt_template = """<|im_start|>system -{system} -<|im_end|> -<|im_start|>user -{question}<|im_end|> -<|im_start|>assistant -<|im_start|>system -""" - - -SUPPORTED_MODELS = {"zephyr": ZephyrSettings, "mistral": MistralSettings} - - -def get_models(): - return list(SUPPORTED_MODELS.keys()) - - -def get_model_setting(model_name: str): - model_settings = SUPPORTED_MODELS.get(model_name) - - # validate input - if model_settings is None: - raise KeyError(model_name + " is a not supported model") - - return model_settings - - -class Model: - """ - This Model class encapsulates the initialization of the language model and tokenizer, as well as the generation of - prompts and outputs. - You can create an instance of this class and use its methods to handle the specific tasks you need. - """ - - def __init__(self, model_folder: Path, model_settings: ModelSettings): - self.model_settings = model_settings - self.model_path = model_folder / self.model_settings.file_name - self.prompt_template = self.model_settings.prompt_template - self.system_template = self.model_settings.system_template - - self._auto_download() - - self.llm = AutoModelForCausalLM.from_pretrained( - model_path_or_repo_id=str(model_folder), - model_file=self.model_settings.file_name, - model_type=self.model_settings.model_type, - config=AutoConfig(config=self.model_settings.config), - hf=True, - ) - self.tokenizer = AutoTokenizer.from_pretrained(self.llm) - - def _auto_download(self) -> None: - """ - Downloads a model file based on the provided name and saves it to the specified path. - - Returns: - None - - Raises: - Any exceptions raised during the download process will be caught and printed, but not re-raised. - - This function fetches model settings using the provided name, including the model's URL, and then downloads - the model file from the URL. The download is done in chunks, and a progress bar is displayed to visualize - the download process. - - """ - file_name = self.model_settings.file_name - url = self.model_settings.url - - if not os.path.exists(self.model_path): - # send a GET request to the URL to download the file. 
- # Stream it while downloading, since the file is large - - try: - response = requests.get(url, stream=True) - # open the file in binary mode and write the contents of the response - # in chunks. - with open(self.model_path, "wb") as f: - for chunk in tqdm(response.iter_content(chunk_size=8912)): - if chunk: - f.write(chunk) - - except Exception as e: - print(f"=> Download Failed. Error: {e}") - return - - print(f"=> Model: {file_name} downloaded successfully 🥳") - - def generate_prompt(self, question): - return generate_prompt( - template=self.prompt_template, - system=self.system_template, - question=question, - ) - - def generate_output(self, prompt: str, max_new_tokens: int = 1000): - inputs = self.tokenizer(text=prompt, return_tensors="pt").input_ids - streamer = TextStreamer(tokenizer=self.tokenizer, skip_prompt=True) - output = self.llm.generate(inputs, streamer=streamer, max_new_tokens=max_new_tokens) - - return output diff --git a/experiments/exp_ctransformers/prompts.py b/experiments/exp_ctransformers/prompts.py deleted file mode 100644 index f961fb4..0000000 --- a/experiments/exp_ctransformers/prompts.py +++ /dev/null @@ -1,33 +0,0 @@ -def generate_prompt(template: str, system: str, question: str): - """ - Generate a prompt by formatting a template with provided parameters. - - Args: - template (str): The template string with placeholders for 'system', 'context', and 'question'. - system (str): The system information to be inserted into the template. - question (str): The question to be inserted into the template. - - Returns: - str: The formatted prompt string. - """ - - prompt = template.format(system=system, question=question) - return prompt - - -def generate_contextual_prompt(template: str, system: str, question: str, context: str = ""): - """ - Generate a prompt by formatting a template with provided parameters. - - Args: - template (str): The template string with placeholders for 'system', 'context', and 'question'. - system (str): The system information to be inserted into the template. - question (str): The question to be inserted into the template. - context (str): The context information to be inserted into the template. - - Returns: - str: The formatted prompt string. - """ - - prompt = template.format(system=system, context=context, question=question) - return prompt diff --git a/todo.md b/todo.md index dfd4f92..099d6f0 100644 --- a/todo.md +++ b/todo.md @@ -1,5 +1,4 @@ # Todo -- Try Chat Templates https://medium.com/@ahmet_celebi/demystifying-chat-templates-of-llm-using-llama-cpp-and-ctransformers-f17871569cd6 - Test Flash attention: - https://github.com/ggerganov/llama.cpp/pull/5021 - Google Search with LLM: diff --git a/version/ctransformers b/version/ctransformers deleted file mode 100644 index b326984..0000000 --- a/version/ctransformers +++ /dev/null @@ -1 +0,0 @@ -0.2.27
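
Note (not part of the patch above): with the ctransformers experiment removed, the project relies solely on llama-cpp-python, as the updated README describes. The sketch below is a minimal, illustrative equivalent of what the deleted `Model` class did — load a quantized GGUF file and generate a completion — using llama-cpp-python's `Llama` API. The model path, prompt template, and sampling values are carried over from the deleted `ZephyrSettings` for illustration only; they are assumptions, not code introduced by this commit.

```python
# Minimal sketch, assuming llama-cpp-python is installed as per the Makefile targets.
# n_gpu_layers plays the role of the removed ctransformers gpu_layers setting
# (set it to 0 if no GPU acceleration is available).
from llama_cpp import Llama

llm = Llama(
    model_path="models/zephyr-7b-beta.Q4_K_M.gguf",  # quantized GGUF file, downloaded separately
    n_ctx=3048,        # context length (the deleted config used context_length=3048)
    n_gpu_layers=50,   # number of layers to offload to GPU
    seed=-1,
)

# Prompt formatted with the Zephyr template from the deleted settings.
prompt = """<|system|> You are a helpful, respectful and honest assistant.
Answer the question below:

<|user|>
Create a regex to extract dates from logs in Python
<|assistant|>
"""

output = llm(
    prompt,
    max_tokens=1000,
    temperature=0.8,
    top_k=40,
    top_p=0.95,
    repeat_penalty=1.1,
)
print(output["choices"][0]["text"])
```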