refactor: get rid of ctransformers
umbertogriffo committed Aug 17, 2024
1 parent f599ef2 commit d239353
Showing 8 changed files with 12 additions and 283 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
@@ -40,17 +40,12 @@ jobs:
id: llama-cpp-version
run: echo "llama-cpp-version=$(cat version/llama_cpp)" >> "$GITHUB_OUTPUT"

- name: Get ctransformers version
id: ctransformers-version
run: echo "ctransformers-version=$(cat version/ctransformers)" >> "$GITHUB_OUTPUT"

# Installing dependencies and llama-cpp-python without NVIDIA CUDA acceleration.
- name: Setup environment
run: |
poetry lock --check
poetry install --no-root --no-ansi
. .venv/bin/activate && pip3 install llama-cpp-python==${{ steps.llama-cpp-version.outputs.llama-cpp-version }}
. .venv/bin/activate && pip3 install ctransformers==${{ steps.ctransformers-version.outputs.ctransformers-version }}
- name: Run tests
run: |
9 changes: 2 additions & 7 deletions Makefile
@@ -3,9 +3,6 @@
llama_cpp_file=version/llama_cpp
llama_cpp_version=`cat $(llama_cpp_file)`

ctransformers_file=version/ctransformers
ctransformers_version=`cat $(ctransformers_file)`

check:
which pip3
which python3
@@ -15,18 +12,16 @@ install_cuda:
mkdir -p .venv
poetry config virtualenvs.in-project true
poetry install --extras "cuda-acceleration" --no-root --no-ansi
echo "Installing llama-cpp-python and ctransformers with pip to get NVIDIA CUDA acceleration"
echo "Installing llama-cpp-python with pip to get NVIDIA CUDA acceleration"
. .venv/bin/activate && CMAKE_ARGS="-DGGML_CUDA=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v
. .venv/bin/activate && pip3 install ctransformers[cuda]==$(ctransformers_version)

install_metal:
echo "Installing..."
mkdir -p .venv
poetry config virtualenvs.in-project true
poetry install --no-root --no-ansi
echo "Installing llama-cpp-python and ctransformers with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)"
echo "Installing llama-cpp-python with pip to get Metal GPU acceleration for macOS systems only (it doesn't install CUDA dependencies)"
. .venv/bin/activate && CMAKE_ARGS="-DGGML_METAL=on" pip3 install llama-cpp-python==$(llama_cpp_version) -v
. .venv/bin/activate && CT_METAL=1 pip install ctransformers==$(ctransformers_version) --no-binary ctransformers

install_pre_commit:
poetry run pre-commit install
25 changes: 10 additions & 15 deletions README.md
@@ -12,9 +12,8 @@
> * `MacOS Sonoma 14.3.1` running on a MacBook Pro M1 (2020).
>
> If you are using another Operating System or different hardware, and you can't load the models, please
> take a look either at the official Llama Cpp Python's
> take a look at the official Llama Cpp Python's
> GitHub [issue](https://github.com/abetlen/llama-cpp-python/issues).
> or at the official CTransformers's GitHub [issue](https://github.com/marella/ctransformers/issues)
> [!WARNING]
> Note: the large language model can sometimes generate hallucinations or false information.
@@ -39,10 +38,8 @@
## Introduction

This project combines the power
of [Lama.cpp](https://github.com/abetlen/llama-cpp-python), [CTransformers](https://github.com/marella/ctransformers),
[LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and
querying the Vector Database, and we plan to eliminate it entirely),
[Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build:
of [Llama.cpp](https://github.com/abetlen/llama-cpp-python), [LangChain](https://python.langchain.com/docs/get_started/introduction.html) (only used for document chunking and querying the Vector Database, and we plan to
eliminate it entirely), [Chroma](https://github.com/chroma-core/chroma) and [Streamlit](https://discuss.streamlit.io/) to build:

* a Conversation-aware Chatbot (ChatGPT-like experience).
* a RAG (Retrieval-augmented generation) ChatBot (a minimal sketch of this flow follows below).
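
For illustration, here is a minimal sketch of that RAG flow, assuming a persisted Chroma collection named `docs` and a hypothetical local GGUF model path (neither reflects the project's actual code):

```python
import chromadb
from llama_cpp import Llama

# Hypothetical local resources: a 4-bit quantized GGUF model and a persisted Chroma store.
llm = Llama(model_path="models/model.Q4_K_M.gguf", n_ctx=2048, verbose=False)
client = chromadb.PersistentClient(path="vector_store")
collection = client.get_collection("docs")

def rag_answer(question: str, k: int = 2) -> str:
    # Retrieve the k most relevant document chunks for the question.
    results = collection.query(query_texts=[question], n_results=k)
    context = "\n".join(results["documents"][0])
    # Ground the model's answer in the retrieved chunks.
    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}\nAnswer:"
    return llm(prompt, max_tokens=256, stop=["Question:"])["choices"][0]["text"]
```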
@@ -79,7 +76,7 @@ To deal with context overflows, we implemented three approaches:
## Prerequisites

* Python 3.10+
* GPU supporting CUDA 12.1, 12.2, 12.3, or 12.4
* GPU supporting CUDA 12.1+
* Poetry 1.7.0

### Install Poetry
@@ -124,14 +121,13 @@ To easily install the dependencies we created a make file.

## Using the Open-Source Models Locally

We utilize two open-source libraries, [Lama.cpp](https://github.com/abetlen/llama-cpp-python)
and [CTransformers](https://github.com/marella/ctransformers),
which allow us to work efficiently with transformer-based models efficiently.
Running the LLMs architecture on a local PC is impossible due to the large (~7 billion) number of
parameters. These libraries enable us to run them either on a `CPU` or `GPU`.
We utilize the open-source library [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a binding for [llama-cpp](https://github.com/ggerganov/llama.cpp),
which allows us to use it within a Python environment.
`llama-cpp` serves as a C++ backend designed to work efficiently with transformer-based models.
Running the full LLM architecture on a local PC is impractical due to the large number of parameters (~7 billion).
This library enables us to run them on either a `CPU` or a `GPU`.
Additionally, we apply quantization with 4-bit precision to reduce the number of bits required to represent the numbers.
The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c)
format.
The quantized models are stored in [GGML/GGUF](https://medium.com/@phillipgimmi/what-is-gguf-and-ggml-e364834d241c) format.
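
As a rough sketch of how such a quantized model is used, `llama-cpp-python` can load a GGUF file and run completions as below (the model path and parameters are illustrative placeholders, not the project's defaults):

```python
from llama_cpp import Llama

# Hypothetical path to a locally downloaded 4-bit quantized GGUF model.
llm = Llama(
    model_path="models/openchat-3.6-8b.Q4_K_M.gguf",
    n_ctx=2048,       # context window size in tokens
    n_gpu_layers=-1,  # offload all layers to the GPU; set to 0 for CPU-only
    verbose=False,
)

# Simple completion call; llama-cpp-python also exposes a chat-style API.
output = llm("Q: What is 4-bit quantization? A:", max_tokens=64, stop=["Q:"])
print(output["choices"][0]["text"])
```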

### Supported Models

@@ -244,7 +240,6 @@ streamlit run chatbot/rag_chatbot_app.py -- --model openchat-3.6 --k 2 --synthes
* Open Source Repositories:
* [llama.cpp](https://github.com/ggerganov/llama.cpp)
* [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
* [CTransformers](https://github.com/marella/ctransformers)
* [GPT4All](https://github.com/nomic-ai/gpt4all)
* [pyllamacpp](https://github.com/abdeladim-s/pyllamacpp)
* [chroma](https://github.com/chroma-core/chroma)
26 changes: 0 additions & 26 deletions experiments/exp_ctransformers/explore_ctransformers.py

This file was deleted.

195 changes: 0 additions & 195 deletions experiments/exp_ctransformers/model.py

This file was deleted.

33 changes: 0 additions & 33 deletions experiments/exp_ctransformers/prompts.py

This file was deleted.

1 change: 0 additions & 1 deletion todo.md
@@ -1,5 +1,4 @@
# Todo
- Try Chat Templates https://medium.com/@ahmet_celebi/demystifying-chat-templates-of-llm-using-llama-cpp-and-ctransformers-f17871569cd6
- Test Flash attention:
- https://github.com/ggerganov/llama.cpp/pull/5021
- Google Search with LLM:
1 change: 0 additions & 1 deletion version/ctransformers

This file was deleted.
