diff --git a/applications/holochat_local/Dockerfile b/applications/holochat_local/Dockerfile
index 46c1135a8..23187038a 100644
--- a/applications/holochat_local/Dockerfile
+++ b/applications/holochat_local/Dockerfile
@@ -23,6 +23,7 @@ WORKDIR /workspace
 COPY requirements.txt /tmp/requirements.txt
 RUN python3 -m pip install --no-cache-dir -r /tmp/requirements.txt
 
+# Clone Llama.cpp and check out a stable commit
 RUN git clone https://github.com/ggerganov/llama.cpp.git \
     && cd llama.cpp \
     && git checkout cf9b08485c4c2d4d945c6e74fe20f273a38b6104 \
diff --git a/applications/holochat_local/Makefile b/applications/holochat_local/Makefile
index a28c110c4..1d723e238 100644
--- a/applications/holochat_local/Makefile
+++ b/applications/holochat_local/Makefile
@@ -13,13 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Target that runs all commands needed to run HoloChat
 .PHONY: run_holochat
 run_holochat: build_llamaCpp build_db download_llama start_holochat
 
+# Creates the container used for HoloChat and compiles Llama.cpp
 .PHONY: build_llamaCpp
 build_llamaCpp:
	docker build --ulimit memlock=-1 --ulimit stack=67108864 -t holochat .
 
+# Creates the vector database used by HoloChat
 .PHONY: build_db
 build_db:
	mkdir -p holochat/embeddings
@@ -36,20 +39,29 @@ build_db:
		holochat \
		python3 build_holoscan_db.py \
 
+# Downloads the Code Llama model used by HoloChat
 .PHONY: download_llama
 download_llama:
	mkdir -p holochat/docs
	wget -nc -P ./holochat/models https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf
 
+# Runs HoloChat inside the PyTorch container
 .PHONY: start_holochat
 start_holochat:
	docker run --rm -it \
		-p 7860:7860 \
		-p 8080:8080 \
		--gpus all \
-		--ipc=host --ulimit memlock=-1 \
+		--ipc=host \
+		--ulimit memlock=-1 \
		--ulimit stack=67108864 \
		-v ./holochat:/holochat \
		-w /holochat \
		holochat \
-	bash -c "/workspace/llama.cpp/build/bin/server -m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf --host 0.0.0.0 -ngl 1000 -c 4096 --alias llama_2 & python3 -u chatbot.py"
+	bash -c "/workspace/llama.cpp/build/bin/server \
+		-m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf \
+		--host 0.0.0.0 \
+		-ngl 1000 \
+		-c 4096 \
+		--alias llama_2 \
+		& python3 -u chatbot.py"
\ No newline at end of file
diff --git a/applications/holochat_local/README.md b/applications/holochat_local/README.md
index 15329f228..57a22ed85 100644
--- a/applications/holochat_local/README.md
+++ b/applications/holochat_local/README.md
@@ -1,6 +1,6 @@
 # HoloChat-local
 
-HoloChat-local is an AI-driven chatbot, built on top of a local Code Llama model running on IGX Orin. The chatbot leverages vector databases to generate human-like responses and write code.
+HoloChat-local is an AI-driven chatbot built on top of a locally hosted Code Llama model that acts as a developer's copilot for Holoscan development. The model leverages a vector database composed of the Holoscan SDK repository and user guide, enabling HoloChat to answer general questions about Holoscan as well as act as a Holoscan SDK coding assistant.

 HoloChat Demo
diff --git a/applications/holochat_local/holochat/build_holoscan_db.py b/applications/holochat_local/holochat/build_holoscan_db.py
index 3cd5311a0..1e1353742 100644
--- a/applications/holochat_local/holochat/build_holoscan_db.py
+++ b/applications/holochat_local/holochat/build_holoscan_db.py
@@ -35,6 +35,8 @@ def main():
 
     content_lists = {file_type: [] for file_type in file_types}
     total_files = 0
+
+    # Loop over each repo and create a Document for each file found
     for repo in repos:
         clone_repository(repo, "")
         for file_type in file_types:
@@ -53,6 +55,7 @@ def main():
             )
         )
 
+    # Loop over the user guide and create a Document for each page
     content_lists[".pdf"] = []
     for doc in docs:
         loader = PyPDFLoader(doc)
@@ -77,6 +80,7 @@ def main():
             Document(page_content=page_content, metadata={"userguide": doc})
         )
 
+    # Dictionary used to map file type to language
     ext_to_language = {
         ".py": "python",
         ".cpp": "cpp",
@@ -95,6 +99,7 @@ def main():
     model_kwargs = {"device": "cuda"}
     encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
 
+    # Create the local embedding model, cached at ./models
     embedding_model = HuggingFaceBgeEmbeddings(
         model_name=model_name,
         model_kwargs=model_kwargs,
diff --git a/applications/holochat_local/holochat/chatbot.py b/applications/holochat_local/holochat/chatbot.py
index 7f52ef39c..c81b71a9f 100644
--- a/applications/holochat_local/holochat/chatbot.py
+++ b/applications/holochat_local/holochat/chatbot.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Simple Gradio chatbot app; for details visit:
+# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
+
 import gradio as gr
 import sklearn
 from llm import LLM
@@ -21,10 +24,6 @@
 
 initial_prompt = "Welcome to HoloChat! How can I assist you today?"
 
-python_prompt = """Create a Python Holoscan 'hello world' app with video
-    as input, use HoloViz to print 'Hello World' on each frame, and then output
-    it to the user. After the code explain the process step-by-step."""
-
 
 def ask_question(message, chat_history):
     if chat_history is None:
@@ -39,9 +38,7 @@ def ask_question(message, chat_history):
 def stream_response(chat_history, llm):
     if llm is None:
         llm = LLM()
-
     response = llm.answer_question(chat_history)
-
     for chunk in response:
         yield chunk, llm
 
@@ -101,7 +98,12 @@ def main():
                 ["What operating system can I use with the Holoscan SDK?"],
                 ["What hardware does Holoscan support?"],
                 ["How do I create a C++ Holoscan Operator?"],
-                [python_prompt],
+                [
+                    "Create a Python Holoscan 'hello world' app with video "
+                    "as input, use HoloViz to print 'Hello World' on each frame, "
+                    "and then output it to the user. After the code explain the "
+                    "process step-by-step."
+                ],
             ],
             inputs=tbInput,
         )
diff --git a/applications/holochat_local/holochat/llm.py b/applications/holochat_local/holochat/llm.py
index 9da65e930..9602a902e 100644
--- a/applications/holochat_local/holochat/llm.py
+++ b/applications/holochat_local/holochat/llm.py
@@ -22,25 +22,27 @@
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import Chroma
 
-# Llama-2 has context length of 4096 token
-# 1 token = ~4 characters, so 3500 * 4 provides plenty of room.
+# Most Llama-2 models are trained with a context length of 4096 tokens
+# 1 token = ~4 characters, so 3300 * 4 provides plenty of room.
 MAX_TOKENS = 3300 * 4
-# Empirically found to be the cutoff of specific questions vs. generic comments about previous answer
+# Empirically found to be the cutoff between specific questions and generic comments about the previous answer
+# This ensures no documents are returned for comments such as "Rewrite that code in one block"
 SEARCH_THRESHOLD = 0.35
 
 NUM_HOLOSCAN_DOCS = 7
 LLAMA_SERVER = "http://127.0.0.1:8080"
-SERVER_TIMEOUT = 60  # seconds
+SERVER_TIMEOUT = 60  # Timeout in seconds to connect to llama.cpp
 
-system_prompt = """You are NVIDIA-GPT, an expert at all things NVIDIA who knows
-    the Holoscan user guide, as well as examples from Holohub and the api from the SDK.
-    You are an assistant who answers questions step-by-step and always provides your
-    reasoning so you have the correct result. Answer the questions based on the provided
-    context and augment with your general knowledge where appropriate. Reformat the provided
-    code examples as necessary since they were retrieved with a web scrape.
-    Under no circumstances will you make up Holoscan API functions or functionality that does not exist!
-    Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response with ''.
-    Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:
-"""
+system_prompt = (
+    "You are NVIDIA-GPT, an expert at all things NVIDIA who knows "
+    "the Holoscan user guide, as well as examples from HoloHub and the API from the SDK. "
+    "You are an assistant who answers questions step-by-step and always provides your "
+    "reasoning so you have the correct result. Answer the questions based on the provided "
+    "context and augment with your general knowledge where appropriate. Reformat the provided "
+    "code examples as necessary since they were retrieved with a web scrape. "
+    "Under no circumstances will you make up Holoscan API functions or functionality that does not "
+    "exist! Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response "
+    "with ''. Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:"
+)
 
 class LLM:
@@ -54,7 +56,7 @@ def answer_question(self, chat_history):
         docs = self.db.similarity_search_with_score(
             query=question, k=NUM_HOLOSCAN_DOCS, distance_metric="cos"
         )
-        # Filter out poor matches
+        # Filter out poor matches from the vector db
         docs = list(
             map(lambda lc_doc: lc_doc[0], filter(lambda lc_doc: lc_doc[1] < SEARCH_THRESHOLD, docs))
         )
@@ -65,6 +67,7 @@
         ]  # Get first docs (highest similarity score)
         self.prev_docs = docs  # Save document list
 
+        # Create a prompt to send to the llm (remove greeting and question)
         llama_prompt = _to_llama_prompt(chat_history[1:-1], question, docs)
         response = self._stream_ai_response(llama_prompt, chat_history)
 
@@ -72,6 +75,7 @@
         yield chunk
 
     def _stream_ai_response(self, llama_prompt, chat_history):
+        # Llama-specific request data
         request_data = {
             "prompt": llama_prompt,
             "temperature": 0,
@@ -99,33 +103,49 @@ def _get_database(self):
         model_kwargs = {"device": "cuda"}
         encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
 
+        # Construct the embedding model and cache it to the local './models' dir
         embedding_model = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             model_kwargs=model_kwargs,
             encode_kwargs=encode_kwargs,
             cache_folder="./models",
         )
-        # Use past two questions to get docs
         chroma_db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)
         return chroma_db
 
 
 def _to_llama_prompt(history, question, docs):
+    """
+    Takes the chat history, the current question, and the documents from the
+    vector db, and creates a single string to prompt the Llama model with
+    """
+
+    # Phind v2's prompt prefixes (note these are dependent on the model used)
     user_prefix = "### User Message:"
     bot_prefix = "### Assistant:"
     bot_rule_prefix = "### System Prompt:"
 
-    opening_prompt = f"""Below is a chat between a user '{user_prefix}', and you, the AI
-    assistant '{bot_prefix}'. You follow the given rule '{bot_rule_prefix}' no matter what."""
+    # Explain the context of the information being provided
+    opening_prompt = (
+        f"Below is a chat between a user '{user_prefix}', and you, "
+        f"the AI assistant '{bot_prefix}'. You follow the given rule "
+        f"'{bot_rule_prefix}' no matter what."
+    )
 
+    # Combine all the vector db docs into a single string
     docs = "\n\n".join(list(map(lambda lc_doc: lc_doc.page_content, docs)))
+    # Add the system prompt with the vector db docs
     opening_prompt += f"\n\n{bot_rule_prefix}\n{system_prompt}\n\n{docs}"
 
-    ending_prompt = f"""\n\n{user_prefix}\nUsing the previous conversation history,
-    the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer
-    the following question (include markdown code snippets for coding questions and do not acknowledge
-    that documentation was provided to you):\n{question}"""
-
+    # Define the final portion of the prompt
+    ending_prompt = (
+        f"\n\n{user_prefix}\nUsing the previous conversation history, "
+        "the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer "
+        "the following question (include markdown code snippets for coding questions and do not "
+        f"acknowledge that documentation was provided to you):\n{question}"
+    )
+
+    # Loop over the chat history and convert it to a single string
     msg_hist = ""
     for msg_pair in history:
         if msg_pair[0]:
@@ -135,23 +155,30 @@ def _to_llama_prompt(history, question, docs):
 
     len_prompt = len(msg_hist) + len(opening_prompt) + len(ending_prompt)
 
-    # Remove previous conversation history if MAX_TOKENS exceeded
+    # Truncate previous conversation history if MAX_TOKENS is exceeded
     if len_prompt > MAX_TOKENS:
         excess_tokens = len_prompt - MAX_TOKENS
         msg_hist = msg_hist[excess_tokens:]
         last_msg_idx = msg_hist.find("\n\n" + user_prefix)
         bot_idx = msg_hist.find("\n\n" + bot_prefix)
+        # Truncate to the last user or bot message, whichever allows for a
+        # longer chat history
         if bot_idx < last_msg_idx:
             last_msg_idx = bot_idx
         msg_hist = msg_hist[last_msg_idx:]
 
+    # Create the final prompt
     prompt = opening_prompt + msg_hist + ending_prompt + f"\n\n{bot_prefix}\n"
 
     return prompt
 
 
 def _wait_for_server():
+    """
+    Attempts to connect to the llama.cpp server for up to
+    SERVER_TIMEOUT seconds before raising an exception
+    """
     attempts = 0
-    while attempts < SEARCH_THRESHOLD / 5:
+    while attempts < SERVER_TIMEOUT / 5:
         try:
             response = requests.get(LLAMA_SERVER)
             # Check for a successful response status code (e.g., 200 OK)
diff --git a/applications/holochat_local/holochat/utils.py b/applications/holochat_local/holochat/utils.py
index e6e4cd2b6..912113911 100644
--- a/applications/holochat_local/holochat/utils.py
+++ b/applications/holochat_local/holochat/utils.py
@@ -25,6 +25,9 @@
 
 
 def clone_repository(repo, token):
+    """
+    Used to clone nvidia-holoscan repos
+    """
     print(f"Cloning repository: {repo}")
     time.sleep(1)
     try:
@@ -39,6 +42,9 @@
 
 
 def clone_general_repository(repo, token):
+    """
+    Used to clone general repos
+    """
     print(f"Cloning repository: {repo}")
     time.sleep(1)
     try:
@@ -81,8 +87,12 @@ def get_files(files, type):
     return contents
 
 
-# langchain method can't handle 'disallowed_special' - use tiktoken for now
 def get_source_chunks(all_contents, file_type=None, chunk_size=1500, chunk_overlap=150):
+    """
+    Splits Documents into chunks for storage. If the language is supported,
+    the document is split according to the syntax of that language
+    (e.g. not splitting Python functions in the middle)
+    """
     if file_type in ["python", "cpp", "markdown"]:
         splitter = RecursiveCharacterTextSplitter.from_language(
             language=file_type, chunk_size=chunk_size, chunk_overlap=chunk_overlap
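
For context on what this patch wires together: `start_holochat` launches the llama.cpp server on port 8080, and `llm.py` sends its `request_data` to it at `LLAMA_SERVER`. Below is a minimal standalone sketch of such a client request, assuming llama.cpp's standard `/completion` endpoint and its default JSON fields; the `quick_completion` helper is hypothetical and not part of this patch, and the full set of fields HoloChat sends is not shown in this diff.

```python
# Hypothetical sanity check against the llama.cpp server started by
# `make start_holochat`; endpoint and fields follow llama.cpp's server example.
import requests

LLAMA_SERVER = "http://127.0.0.1:8080"  # same address llm.py uses


def quick_completion(prompt: str) -> str:
    """Send one non-streaming completion request and return the generated text."""
    response = requests.post(
        f"{LLAMA_SERVER}/completion",
        json={
            "prompt": prompt,
            "temperature": 0,  # matches the deterministic setting in llm.py
            "n_predict": 128,  # assumption: cap the response length
            "stream": False,
        },
        timeout=60,
    )
    response.raise_for_status()
    # llama.cpp's server returns the generated text under the "content" key
    return response.json()["content"]


if __name__ == "__main__":
    # Uses the same Phind v2 prompt prefixes that _to_llama_prompt builds
    print(quick_completion("### User Message:\nWhat is Holoscan?\n\n### Assistant:\n"))
```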