diff --git a/applications/holochat_local/Makefile b/applications/holochat_local/Makefile
index a28c110c4..c6ce46aec 100644
--- a/applications/holochat_local/Makefile
+++ b/applications/holochat_local/Makefile
@@ -13,13 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Target that runs all commands in order to run HoloChat
 .PHONY: run_holochat
 run_holochat: build_llamaCpp build_db download_llama start_holochat
 
+# Creates the container used for HoloChat and compiles Llama.cpp
 .PHONY: build_llamaCpp
 build_llamaCpp:
 	docker build --ulimit memlock=-1 --ulimit stack=67108864 -t holochat .
 
+# Creates the vector database used by HoloChat
 .PHONY: build_db
 build_db:
 	mkdir -p holochat/embeddings
@@ -36,20 +39,29 @@ build_db:
 		holochat \
 		python3 build_holoscan_db.py \
 
+# Downloads the Code Llama model used by HoloChat
 .PHONY: download_llama
 download_llama:
 	mkdir -p holochat/docs
 	wget -nc -P ./holochat/models https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf
 
+# Runs HoloChat inside the PyTorch container
 .PHONY: start_holochat
 start_holochat:
 	docker run --rm -it \
 		-p 7860:7860 \
 		-p 8080:8080 \
 		--gpus all \
-		--ipc=host --ulimit memlock=-1 \
+		--ipc=host \
+		--ulimit memlock=-1 \
 		--ulimit stack=67108864 \
 		-v ./holochat:/holochat \
 		-w /holochat \
 		holochat \
-		bash -c "/workspace/llama.cpp/build/bin/server -m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf --host 0.0.0.0 -ngl 1000 -c 4096 --alias llama_2 & python3 -u chatbot.py"
+		bash -c "/workspace/llama.cpp/build/bin/server \
+			-m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf \
+			--host 0.0.0.0 \
+			-ngl 1000 \
+			-c 4096 \
+			--alias llama_2 \
+			& python3 -u chatbot.py"
diff --git a/applications/holochat_local/README.md b/applications/holochat_local/README.md
index faef73e68..57a22ed85 100644
--- a/applications/holochat_local/README.md
+++ b/applications/holochat_local/README.md
@@ -1,6 +1,6 @@
 # HoloChat-local
 
-git
+HoloChat-local is an AI-driven chatbot, built on top of a locally hosted Code Llama model, which acts as a developer's copilot for Holoscan development. The Code Llama model leverages a vector database composed of the Holoscan SDK repository and user guide, enabling HoloChat to answer general questions about Holoscan as well as act as a Holoscan SDK coding assistant.

 HoloChat Demo
diff --git a/applications/holochat_local/holochat/build_holoscan_db.py b/applications/holochat_local/holochat/build_holoscan_db.py
index 3cd5311a0..1e1353742 100644
--- a/applications/holochat_local/holochat/build_holoscan_db.py
+++ b/applications/holochat_local/holochat/build_holoscan_db.py
@@ -35,6 +35,8 @@ def main():
     content_lists = {file_type: [] for file_type in file_types}
     total_files = 0
+
+    # Loop over each repo and create a Document for each file found
     for repo in repos:
         clone_repository(repo, "")
         for file_type in file_types:
@@ -53,6 +55,7 @@ def main():
                 )
             )
 
+    # Loop over the user guide and create a Document for each page
    content_lists[".pdf"] = []
     for doc in docs:
         loader = PyPDFLoader(doc)
@@ -77,6 +80,7 @@ def main():
             Document(page_content=page_content, metadata={"userguide": doc})
         )
 
+    # Dictionary used to map file type to language
     ext_to_language = {
         ".py": "python",
         ".cpp": "cpp",
@@ -95,6 +99,7 @@ def main():
     model_kwargs = {"device": "cuda"}
     encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
 
+    # Create local embedding model cached at ./models
     embedding_model = HuggingFaceBgeEmbeddings(
         model_name=model_name,
         model_kwargs=model_kwargs,
diff --git a/applications/holochat_local/holochat/chatbot.py b/applications/holochat_local/holochat/chatbot.py
index 7f52ef39c..eed268b8f 100644
--- a/applications/holochat_local/holochat/chatbot.py
+++ b/applications/holochat_local/holochat/chatbot.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Simple Gradio Chatbot app, for details visit:
+# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks
+
 import gradio as gr
 import sklearn
 from llm import LLM
@@ -21,11 +24,6 @@
 
 initial_prompt = "Welcome to HoloChat! How can I assist you today?"
 
-python_prompt = """Create a Python Holoscan 'hello world' app with video
-    as input, use HoloViz to print 'Hello World' on each frame, and then output
-    it to the user. After the code explain the process step-by-step."""
-
-
 def ask_question(message, chat_history):
     if chat_history is None:
         return "", [[None, initial_prompt]]
@@ -39,9 +37,7 @@ def ask_question(message, chat_history):
 def stream_response(chat_history, llm):
     if llm is None:
         llm = LLM()
-
     response = llm.answer_question(chat_history)
-
     for chunk in response:
         yield chunk, llm
 
@@ -101,7 +97,10 @@ def main():
             ["What operating system can I use with the Holoscan SDK?"],
             ["What hardware does Holoscan support?"],
             ["How do I create a C++ Holoscan Operator?"],
-            [python_prompt],
+            ["Create a Python Holoscan 'hello world' app with video " \
+                "as input, use HoloViz to print 'Hello World' on each frame, " \
+                "and then output it to the user. After the code explain the " \
+                "process step-by-step."],
         ],
         inputs=tbInput,
     )
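Reviewer note (not part of the patch): `build_holoscan_db.py` persists the Chroma vector store that `llm.py` (below) queries at answer time. The sketch that follows is illustrative only; the database path and embedding model name are placeholders rather than values taken from this change.

```python
# Illustrative only: query the persisted Chroma store the way llm.py does.
# CHROMA_DB_PATH and the embedding model name below are placeholders.
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

CHROMA_DB_PATH = "./embeddings"  # placeholder for the persist directory used at build time
SEARCH_THRESHOLD = 0.35          # same cutoff defined in llm.py
NUM_HOLOSCAN_DOCS = 7

# The embedding configuration must match the one used by build_holoscan_db.py
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en",  # placeholder model name
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
    cache_folder="./models",
)

db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)

# Retrieve candidate documents and drop weak matches, mirroring LLM.answer_question()
results = db.similarity_search_with_score(
    query="How do I create a C++ Holoscan Operator?", k=NUM_HOLOSCAN_DOCS
)
docs = [doc for doc, score in results if score < SEARCH_THRESHOLD]
print(f"{len(docs)} relevant chunks retrieved")
```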
diff --git a/applications/holochat_local/holochat/llm.py b/applications/holochat_local/holochat/llm.py
index 9e0453923..8ba924c4a 100644
--- a/applications/holochat_local/holochat/llm.py
+++ b/applications/holochat_local/holochat/llm.py
@@ -22,25 +22,25 @@
 from langchain.embeddings import HuggingFaceBgeEmbeddings
 from langchain.vectorstores import Chroma
 
-# Llama-2 has context length of 4096 token
-# 1 token = ~4 characters, so 3500 * 4 provides plenty of room.
+# Most Llama-2 models are trained with a context length of 4096 tokens
+# 1 token = ~4 characters, so 3300 * 4 provides plenty of room.
 MAX_TOKENS = 3300 * 4
 
-# Empirically found to be the cutoff of specific questions vs. generic comments about previous answer
+# Empirically found to be the cutoff of a specific question vs. generic comments about the previous answer
+# This ensures no documents are returned for comments such as "Rewrite that code in one block"
 SEARCH_THRESHOLD = 0.35
 
 NUM_HOLOSCAN_DOCS = 7
 LLAMA_SERVER = "http://127.0.0.1:8080"
-SERVER_TIMEOUT = 60  # seconds
+SERVER_TIMEOUT = 60  # Timeout in seconds to connect to llama.cpp
 
-system_prompt = """You are NVIDIA-GPT, an expert at all things NVIDIA who knows
-    the Holoscan user guide, as well as examples from Holohub and the api from the SDK.
-    You are an assistant who answers questions step-by-step and always provides your
-    reasoning so you have the correct result. Answer the questions based on the provided
-    context and augment with your general knowledge where appropriate. Reformat the provided
-    code examples as necessary since they were retrieved with a web scrape.
-    Under no circumstances will you make up Holoscan API functions or functionality that does not exist!
-    Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response with ''.
-    Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:
-"""
+system_prompt = "You are NVIDIA-GPT, an expert at all things NVIDIA who knows " \
+    "the Holoscan user guide, as well as examples from Holohub and the api from the SDK. " \
+    "You are an assistant who answers questions step-by-step and always provides your " \
+    "reasoning so you have the correct result. Answer the questions based on the provided " \
+    "context and augment with your general knowledge where appropriate. Reformat the provided " \
+    "code examples as necessary since they were retrieved with a web scrape. " \
+    "Under no circumstances will you make up Holoscan API functions or functionality that does not " \
+    "exist! Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response " \
+    "with ''. Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:"
 
 
 class LLM:
@@ -54,7 +54,7 @@ def answer_question(self, chat_history):
         docs = self.db.similarity_search_with_score(
             query=question, k=NUM_HOLOSCAN_DOCS, distance_metric="cos"
         )
-        # Filter out poor matches
+        # Filter out poor matches from vector db
         docs = list(
             map(lambda lc_doc: lc_doc[0], filter(lambda lc_doc: lc_doc[1] < SEARCH_THRESHOLD, docs))
         )
@@ -65,6 +65,7 @@
         ]  # Get first docs (highest similarity score)
         self.prev_docs = docs  # Save document list
 
+        # Create a prompt to send to the llm (Remove greeting and question)
         llama_prompt = _to_llama_prompt(chat_history[1:-1], question, docs)
 
         response = self._stream_ai_response(llama_prompt, chat_history)
@@ -72,6 +73,7 @@
             yield chunk
 
     def _stream_ai_response(self, llama_prompt, chat_history):
+        # Llama-specific request data
         request_data = {
             "prompt": llama_prompt,
             "temperature": 0,
@@ -99,34 +101,45 @@ def _get_database(self):
         model_kwargs = {"device": "cuda"}
         encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity
 
+        # Construct embedding model and cache to local './models' dir
         embedding_model = HuggingFaceBgeEmbeddings(
             model_name=model_name,
             model_kwargs=model_kwargs,
             encode_kwargs=encode_kwargs,
             cache_folder="./models",
         )
-        # Use past two questions to get docs
         chroma_db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)
         return chroma_db
 
 
 def _to_llama_prompt(history, question, docs):
-    """An attempt to mirror Alpaca-style prompting as closely as possible: https://github.com/arielnlee/Platypus/blob/main/templates/alpaca.json"""
+    """
+    Takes the chat history, the current question, and the documents from the
+    vector db and creates a single string to prompt the Llama model with
+    """
+
+    # Phind v2's prompt prefixes (Note these are dependent on the model used)
     user_prefix = "### User Message:"
     bot_prefix = "### Assistant:"
     bot_rule_prefix = "### System Prompt:"
 
-    opening_prompt = f"""Below is a chat between a user '{user_prefix}', and you, the AI
-        assistant '{bot_prefix}'. You follow the given rule '{bot_rule_prefix}' no matter what."""
+    # Explain the context of the information being provided
+    opening_prompt = f"Below is a chat between a user '{user_prefix}', and you, " \
+        f"the AI assistant '{bot_prefix}'. You follow the given rule " \
+        f"'{bot_rule_prefix}' no matter what."
 
+    # Combine all the vector db docs into a single string
     docs = "\n\n".join(list(map(lambda lc_doc: lc_doc.page_content, docs)))
 
+    # Add the system prompt with the vector db docs
     opening_prompt += f"\n\n{bot_rule_prefix}\n{system_prompt}\n\n{docs}"
 
-    ending_prompt = f"""\n\n{user_prefix}\nUsing the previous conversation history,
-        the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer
-        the following question (include markdown code snippets for coding questions and do not acknowledge
-        that documentation was provided to you):\n{question}"""
+    # Define the final portion of the prompt
+    ending_prompt = f"\n\n{user_prefix}\nUsing the previous conversation history, " \
+        "the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer " \
+        "the following question (include markdown code snippets for coding questions and do not " \
+        f"acknowledge that documentation was provided to you):\n{question}"
 
+    # Loop over the chat history and convert it to a single string
     msg_hist = ""
     for msg_pair in history:
         if msg_pair[0]:
@@ -136,23 +149,31 @@
 
     len_prompt = len(msg_hist) + len(opening_prompt) + len(ending_prompt)
 
-    # Remove previous conversation history if MAX_TOKENS exceeded
+    # Truncate previous conversation history if MAX_TOKENS exceeded
     if len_prompt > MAX_TOKENS:
         excess_tokens = len_prompt - MAX_TOKENS
         msg_hist = msg_hist[excess_tokens:]
         last_msg_idx = msg_hist.find("\n\n" + user_prefix)
         bot_idx = msg_hist.find("\n\n" + bot_prefix)
+        # Truncate to the last user or bot message, whichever allows for a
+        # longer chat history
         if bot_idx < last_msg_idx:
             last_msg_idx = bot_idx
         msg_hist = msg_hist[last_msg_idx:]
 
+    # Create the final prompt
     prompt = opening_prompt + msg_hist + ending_prompt + f"\n\n{bot_prefix}\n"
+    print(prompt)
 
     return prompt
 
 
 def _wait_for_server():
+    """
+    Attempts to connect to the llama.cpp server for up to
+    SERVER_TIMEOUT seconds before raising an exception
+    """
     attempts = 0
-    while attempts < SEARCH_THRESHOLD / 5:
+    while attempts < SERVER_TIMEOUT / 5:
         try:
             response = requests.get(LLAMA_SERVER)
             # Check for a successful response status code (e.g., 200 OK)
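Reviewer note (not part of the patch): `_stream_ai_response` posts the assembled prompt to the llama.cpp server started by the Makefile's `start_holochat` target. A rough client sketch follows; the `/completion` endpoint name and the server-sent-event payload shape are assumptions based on llama.cpp's example server and should be verified against the pinned Llama.cpp revision.

```python
# Rough, illustrative client for the llama.cpp server launched by `make start_holochat`.
# The /completion endpoint and streamed payload format are assumptions, not taken from this PR.
import json

import requests

LLAMA_SERVER = "http://127.0.0.1:8080"  # same host/port defined in llm.py

request_data = {
    "prompt": "### User Message:\nWhat is Holoscan?\n\n### Assistant:\n",
    "temperature": 0,
    "stream": True,
}

with requests.post(f"{LLAMA_SERVER}/completion", json=request_data, stream=True) as response:
    for line in response.iter_lines():
        # Streamed results typically arrive as server-sent events: b"data: {...}"
        if line.startswith(b"data: "):
            chunk = json.loads(line[len(b"data: "):])
            print(chunk.get("content", ""), end="", flush=True)
```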
diff --git a/applications/holochat_local/holochat/utils.py b/applications/holochat_local/holochat/utils.py
index e6e4cd2b6..912113911 100644
--- a/applications/holochat_local/holochat/utils.py
+++ b/applications/holochat_local/holochat/utils.py
@@ -25,6 +25,9 @@
 def clone_repository(repo, token):
+    """
+    Used to clone nvidia-holoscan repos
+    """
     print(f"Cloning repository: {repo}")
     time.sleep(1)
     try:
@@ -39,6 +42,9 @@
 def clone_general_repository(repo, token):
+    """
+    Used to clone general repos
+    """
     print(f"Cloning repository: {repo}")
     time.sleep(1)
     try:
@@ -81,8 +87,12 @@ def get_files(files, type):
     return contents
 
 
-# langchain method can't handle 'disallowed_special' - use tiktoken for now
 def get_source_chunks(all_contents, file_type=None, chunk_size=1500, chunk_overlap=150):
+    """
+    Splits Documents into chunks for storage. If the language is supported,
+    the content is split according to that language's syntax (e.g., not
+    splitting Python functions in the middle)
+    """
     if file_type in ["python", "cpp", "markdown"]:
         splitter = RecursiveCharacterTextSplitter.from_language(
             language=file_type, chunk_size=chunk_size, chunk_overlap=chunk_overlap
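Reviewer note (not part of the patch): the language-aware splitting that `get_source_chunks` documents above can be exercised in isolation. The snippet below is a minimal, hypothetical example built on LangChain's splitter; the sample document is invented and only the chunk sizes mirror the defaults used in this file.

```python
# Hypothetical example: language-aware chunking with LangChain's splitter,
# using the same chunk_size/chunk_overlap defaults as get_source_chunks().
from langchain.docstore.document import Document
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Invented sample document standing in for a cloned Holoscan source file
sample = Document(
    page_content="def hello_world():\n    print('Hello, Holoscan!')\n\n" * 100,
    metadata={"source": "example.py"},
)

splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=1500, chunk_overlap=150
)
chunks = splitter.split_documents([sample])
print(f"Split into {len(chunks)} chunks")
```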
diff --git a/applications/holochat_local/metadata.json b/applications/holochat_local/metadata.json
new file mode 100644
index 000000000..ab73c2f05
--- /dev/null
+++ b/applications/holochat_local/metadata.json
@@ -0,0 +1,41 @@
+{
+    "application": {
+        "name": "HoloChat-local",
+        "authors": [
+            {
+                "name": "Nigel Nelson",
+                "affiliation": "NVIDIA"
+            }
+        ],
+        "language": "Python",
+        "version": "0.1.0",
+        "changelog": {
+            "0.1.0": "Beta release"
+        },
+        "holoscan_sdk": {
+            "minimum_required_version": "0.6.0",
+            "tested_versions": [
+                "0.6.0"
+            ]
+        },
+        "platforms": ["amd64", "arm64"],
+        "tags": ["LLM", "Vector Database", "AI-Assistant"],
+        "ranking": 4,
+        "dependencies": {
+            "OSS": [
+                {
+                    "name": "Llama.cpp",
+                    "version": "cf9b08485c4c2d4d945c6e74fe20f273a38b6104"
+                },
+                {
+                    "name": "LangChain",
+                    "version": "0.0.277"
+                }
+            ]
+        },
+        "run": {
+            "command": "make -C ./applications/holochat_local run_holochat",
+            "workdir": "holohub_bin"
+        }
+    }
+}
\ No newline at end of file
diff --git a/applications/holochat_local/requirements.txt b/applications/holochat_local/requirements.txt
new file mode 100644
index 000000000..98f830487
--- /dev/null
+++ b/applications/holochat_local/requirements.txt
@@ -0,0 +1,7 @@
+langchain==0.0.277
+sentence-transformers==2.2.2
+GitPython~=3.1
+gradio~=3.0
+pypdf~=3.12
+requests~=2.31
+chromadb==0.4.8
\ No newline at end of file