Correct prompt string, update intro, add comments
NigelNelson committed Sep 6, 2023
1 parent 87fe836 commit dfa6c54
Showing 8 changed files with 132 additions and 37 deletions.
16 changes: 14 additions & 2 deletions applications/holochat_local/Makefile
@@ -13,13 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Target that runs all commands in order to run HoloChat
.PHONY: run_holochat
run_holochat: build_llamaCpp build_db download_llama start_holochat

# Creates container used for HoloChat and compiles Llama.cpp
.PHONY: build_llamaCpp
build_llamaCpp:
docker build --ulimit memlock=-1 --ulimit stack=67108864 -t holochat .

# Creates the vector database used by HoloChat
.PHONY: build_db
build_db:
mkdir -p holochat/embeddings
@@ -36,20 +39,29 @@ build_db:
holochat \
python3 build_holoscan_db.py \

# Downloads the Code Llama model used by HoloChat
.PHONY: download_llama
download_llama:
mkdir -p holochat/docs
wget -nc -P ./holochat/models https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf

# Runs HoloChat inside the PyTorch container
.PHONY: start_holochat
start_holochat:
docker run --rm -it \
-p 7860:7860 \
-p 8080:8080 \
--gpus all \
--ipc=host --ulimit memlock=-1 \
--ipc=host \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v ./holochat:/holochat \
-w /holochat \
holochat \
bash -c "/workspace/llama.cpp/build/bin/server -m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf --host 0.0.0.0 -ngl 1000 -c 4096 --alias llama_2 & python3 -u chatbot.py"
bash -c "/workspace/llama.cpp/build/bin/server \
-m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf \
--host 0.0.0.0 \
-ngl 1000 \
-c 4096 \
--alias llama_2 \
& python3 -u chatbot.py"
2 changes: 1 addition & 1 deletion applications/holochat_local/README.md
@@ -1,6 +1,6 @@
# HoloChat-local

HoloChat-local is an AI-driven chatbot, built on top of a locally hosted Code Llama model, which acts as a developer's copilot for Holoscan development. The Code Llama model leverages a vector database built from the Holoscan SDK repository and user guide, enabling HoloChat to answer general questions about Holoscan as well as act as a Holoscan SDK coding assistant.
<p align="center">
<kbd style="border: 2px solid black;">
<img src="holochat_demo.gif" alt="HoloChat Demo" />
5 changes: 5 additions & 0 deletions applications/holochat_local/holochat/build_holoscan_db.py
@@ -35,6 +35,8 @@
def main():
content_lists = {file_type: [] for file_type in file_types}
total_files = 0

# Loop over each repo and create a Document for each file found
for repo in repos:
clone_repository(repo, "")
for file_type in file_types:
@@ -53,6 +55,7 @@ def main():
)
)

# Loop over the user guide and create a Document for each page
content_lists[".pdf"] = []
for doc in docs:
loader = PyPDFLoader(doc)
@@ -77,6 +80,7 @@ def main():
Document(page_content=page_content, metadata={"userguide": doc})
)

# Dictionary used to map file type to language
ext_to_language = {
".py": "python",
".cpp": "cpp",
@@ -95,6 +99,7 @@ def main():
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True} # set True to compute cosine similarity

# Create local embedding model cached at ./models
embedding_model = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
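
A minimal sketch (not part of the commit) of the vector-database build this file performs, assuming the langchain 0.0.277 API pinned in requirements.txt. The stand-in document contents, the `BAAI/bge-large-en` model name, and the persist path are illustrative assumptions; only the embedding kwargs mirror the diff above.

```python
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

# Hypothetical stand-in documents; build_holoscan_db.py creates these from
# cloned repo files and user-guide PDF pages
docs = [
    Document(page_content="Holoscan operators implement compute().", metadata={"userguide": "holoscan.pdf"}),
    Document(page_content="def compose(self): ...", metadata={"source": "app.py"}),
]

# Mirrors the embedding setup in the diff: BGE embeddings cached under ./models
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en",  # assumed; the actual model_name is outside this hunk
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # cosine similarity
    cache_folder="./models",
)

# With chromadb 0.4.x, writes under persist_directory are persisted automatically
db = Chroma.from_documents(docs, embedding_model, persist_directory="./embeddings/chroma_db")
```
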
15 changes: 7 additions & 8 deletions applications/holochat_local/holochat/chatbot.py
@@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Simple Gradio chatbot app; for details, visit:
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks

import gradio as gr
import sklearn
from llm import LLM
@@ -21,11 +24,6 @@

initial_prompt = "Welcome to HoloChat! How can I assist you today?"

python_prompt = """Create a Python Holoscan 'hello world' app with video
as input, use HoloViz to print 'Hello World' on each frame, and then output
it to the user. After the code explain the process step-by-step."""


def ask_question(message, chat_history):
if chat_history is None:
return "", [[None, initial_prompt]]
@@ -39,9 +37,7 @@ def ask_question(message, chat_history):
def stream_response(chat_history, llm):
if llm is None:
llm = LLM()

response = llm.answer_question(chat_history)

for chunk in response:
yield chunk, llm

@@ -101,7 +97,10 @@ def main():
["What operating system can I use with the Holoscan SDK?"],
["What hardware does Holoscan support?"],
["How do I create a C++ Holoscan Operator?"],
[python_prompt],
["Create a Python Holoscan 'hello world' app with video " \
"as input, use HoloViz to print 'Hello World' on each frame, " \
"and then output it to the user. After the code explain the " \
"process step-by-step."],
],
inputs=tbInput,
)
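
chatbot.py follows the Blocks pattern from the guide linked at the top of the file. A minimal sketch, assuming Gradio 3.x as pinned in requirements.txt, with a placeholder echo standing in for HoloChat's LLM call:

```python
import gradio as gr

def respond(message, chat_history):
    # Placeholder reply; HoloChat streams tokens from the LLM instead
    chat_history = chat_history or []
    chat_history.append((message, f"You said: {message}"))
    return "", chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # Submitting the textbox clears it and appends the new message pair
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch(server_name="0.0.0.0", server_port=7860)  # same port the Makefile exposes
```
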
71 changes: 46 additions & 25 deletions applications/holochat_local/holochat/llm.py
@@ -22,25 +22,25 @@
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

# Llama-2 has context length of 4096 token
# 1 token = ~4 characters, so 3500 * 4 provides plenty of room.
# Most Llama-2 models are trained with a context length of 4096 tokens.
# 1 token = ~4 characters, so 3300 * 4 provides plenty of room.
MAX_TOKENS = 3300 * 4
# Empirically found to be the cutoff of specific questions vs. generic comments about previous answer
# Empirically found to be the cutoff between specific questions and generic comments about the previous answer
# This ensures no documents are returned for comments such as "Rewrite that code in one block"
SEARCH_THRESHOLD = 0.35
NUM_HOLOSCAN_DOCS = 7
LLAMA_SERVER = "http://127.0.0.1:8080"
SERVER_TIMEOUT = 60 # seconds
SERVER_TIMEOUT = 60 # Timeout in seconds to connect to llama.cpp

system_prompt = """You are NVIDIA-GPT, an expert at all things NVIDIA who knows
the Holoscan user guide, as well as examples from Holohub and the api from the SDK.
You are an assistant who answers questions step-by-step and always provides your
reasoning so you have the correct result. Answer the questions based on the provided
context and augment with your general knowledge where appropriate. Reformat the provided
code examples as necessary since they were retrieved with a web scrape.
Under no circumstances will you make up Holoscan API functions or functionality that does not exist!
Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response with '</s>'.
Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:
"""
system_prompt = "You are NVIDIA-GPT, an expert at all things NVIDIA who knows " \
"the Holoscan user guide, as well as examples from Holohub and the api from the SDK. " \
"You are an assistant who answers questions step-by-step and always provides your " \
"reasoning so you have the correct result. Answer the questions based on the provided " \
"context and augment with your general knowledge where appropriate. Reformat the provided " \
"code examples as necessary since they were retrieved with a web scrape. " \
"Under no circumstances will you make up Holoscan API functions or functionality that does not " \
"exist! Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response " \
"with '</s>'. Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:"


class LLM:
@@ -54,7 +54,7 @@ def answer_question(self, chat_history):
docs = self.db.similarity_search_with_score(
query=question, k=NUM_HOLOSCAN_DOCS, distance_metric="cos"
)
# Filter out poor matches
# Filter out poor matches from the vector db
docs = list(
map(lambda lc_doc: lc_doc[0], filter(lambda lc_doc: lc_doc[1] < SEARCH_THRESHOLD, docs))
)
@@ -65,13 +65,15 @@
] # Get first docs (highest similarity score)
self.prev_docs = docs # Save document list

# Create a prompt to send to the LLM (excludes the initial greeting and the current question)
llama_prompt = _to_llama_prompt(chat_history[1:-1], question, docs)
response = self._stream_ai_response(llama_prompt, chat_history)

for chunk in response:
yield chunk

def _stream_ai_response(self, llama_prompt, chat_history):
# Llama-specific request data
request_data = {
"prompt": llama_prompt,
"temperature": 0,
@@ -99,34 +101,45 @@ def _get_database(self):
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True} # set True to compute cosine similarity

# Construct embedding model and cache to local './models' dir
embedding_model = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs,
cache_folder="./models",
)
# Load the persisted Chroma vector db from CHROMA_DB_PATH
chroma_db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)

return chroma_db


def _to_llama_prompt(history, question, docs):
"""An attempt to mirror Alpaca-style prompting as closely as possible: https://github.com/arielnlee/Platypus/blob/main/templates/alpaca.json"""
"""
Takes the chat history, the current question, and the documents from the
vector db, and creates a single prompt string for the Llama model
"""

# Phind v2's prompt prefixes (Note these are dependent on the model used)
user_prefix = "### User Message:"
bot_prefix = "### Assistant:"
bot_rule_prefix = "### System Prompt:"

opening_prompt = f"""Below is a chat between a user '{user_prefix}', and you, the AI
assistant '{bot_prefix}'. You follow the given rule '{bot_rule_prefix}' no matter what."""
# Explain the context of the information being provided
opening_prompt = f"Below is a chat between a user '{user_prefix}', and you, " \
f"the AI assistant '{bot_prefix}'. You follow the given rule " \
f"'{bot_rule_prefix}' no matter what."

# Combine all the vector db docs into a single string
docs = "\n\n".join(list(map(lambda lc_doc: lc_doc.page_content, docs)))
# Add the system prompt with the vector db docs
opening_prompt += f"\n\n{bot_rule_prefix}\n{system_prompt}\n\n{docs}"
ending_prompt = f"""\n\n{user_prefix}\nUsing the previous conversation history,
the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer
the following question (include markdown code snippets for coding questions and do not acknowledge
that documentation was provided to you):\n{question}"""
# Define the final portion of the prompt
ending_prompt = f"\n\n{user_prefix}\nUsing the previous conversation history, " \
"the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer " \
"the following question (include markdown code snippets for coding questions and do not " \
f"acknowledge that documentation was provided to you):\n{question}"

# Loop over the chat history and convert it to a single string
msg_hist = ""
for msg_pair in history:
if msg_pair[0]:
@@ -136,23 +149,31 @@ def _to_llama_prompt(history, question, docs):

len_prompt = len(msg_hist) + len(opening_prompt) + len(ending_prompt)

# Remove previous conversation history if MAX_TOKENS exceeded
# Truncate previous conversation history if MAX_TOKENS is exceeded
if len_prompt > MAX_TOKENS:
excess_tokens = len_prompt - MAX_TOKENS
msg_hist = msg_hist[excess_tokens:]
last_msg_idx = msg_hist.find("\n\n" + user_prefix)
bot_idx = msg_hist.find("\n\n" + bot_prefix)
# Truncate to the last user or bot message, whichever allows for a
# longer chat history
if bot_idx < last_msg_idx:
last_msg_idx = bot_idx
msg_hist = msg_hist[last_msg_idx:]

# Create the final prompt
prompt = opening_prompt + msg_hist + ending_prompt + f"\n\n{bot_prefix}\n"
print(prompt)
return prompt


def _wait_for_server():
"""
Attempts to connect to the llama.cpp server
for up to SERVER_TIMEOUT seconds before raising an exception
"""
attempts = 0
while attempts < SEARCH_THRESHOLD / 5:
while attempts < SERVER_TIMEOUT / 5:
try:
response = requests.get(LLAMA_SERVER)
# Check for a successful response status code (e.g., 200 OK)
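
`_stream_ai_response` posts `request_data` to the llama.cpp server that the Makefile launches on port 8080. A minimal sketch of that interaction, assuming the `/completion` endpoint and SSE-style `data:` lines of llama.cpp's server example; the exact response fields are assumptions that may vary between llama.cpp revisions:

```python
import json
import requests

LLAMA_SERVER = "http://127.0.0.1:8080"

# A trimmed-down request; llm.py also sets sampling and stop parameters
request_data = {
    "prompt": "### User Message:\nHello\n\n### Assistant:\n",
    "temperature": 0,
    "stream": True,
}

with requests.post(f"{LLAMA_SERVER}/completion", json=request_data, stream=True) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        text = line.decode("utf-8")
        if text.startswith("data: "):  # strip the SSE prefix
            text = text[len("data: "):]
        payload = json.loads(text)
        print(payload.get("content", ""), end="", flush=True)
        if payload.get("stop"):  # server signals the end of generation
            break
```
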
12 changes: 11 additions & 1 deletion applications/holochat_local/holochat/utils.py
@@ -25,6 +25,9 @@


def clone_repository(repo, token):
"""
Used to clone nvidia-holoscan repos
"""
print(f"Cloning repository: {repo}")
time.sleep(1)
try:
@@ -39,6 +42,9 @@ def clone_repository(repo, token):


def clone_general_repository(repo, token):
"""
Used to clone general repos
"""
print(f"Cloning repository: {repo}")
time.sleep(1)
try:
@@ -81,8 +87,12 @@ def get_files(files, type):
return contents


# langchain method can't handle 'disallowed_special' - use tiktoken for now
def get_source_chunks(all_contents, file_type=None, chunk_size=1500, chunk_overlap=150):
"""
Splits Documents into chunks for storage. If the language is supported,
the text is split according to the syntax of that language (e.g., not
splitting Python functions in the middle)
"""
if file_type in ["python", "cpp", "markdown"]:
splitter = RecursiveCharacterTextSplitter.from_language(
language=file_type, chunk_size=chunk_size, chunk_overlap=chunk_overlap
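
`get_source_chunks` defers to LangChain's language-aware splitter for supported file types. A minimal sketch, assuming langchain 0.0.277, showing how `Language.PYTHON` prefers syntactic boundaries (such as `def`) over raw character cuts:

```python
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

code = "def foo():\n    return 1\n\ndef bar():\n    return 2\n"

# Small chunk_size forces a split; the splitter breaks at the "\ndef " boundary
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
for chunk in splitter.split_text(code):
    print(repr(chunk))
```
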
41 changes: 41 additions & 0 deletions applications/holochat_local/metadata.json
@@ -0,0 +1,41 @@
{
"application": {
"name": "HoloChat-local",
"authors": [
{
"name": "Nigel Nelson",
"affiliation": "NVIDIA"
}
],
"language": "Python",
"version": "0.1.0",
"changelog": {
"0.1.0": "Beta release"
},
"holoscan_sdk": {
"minimum_required_version": "0.6.0",
"tested_versions": [
"0.6.0"
]
},
"platforms": ["amd64", "arm64"],
"tags": ["LLM", "Vector Database", "AI-Assistant"],
"ranking": 4,
"dependencies": {
"OSS": [
{
"name": "Llama.cpp",
"version": "cf9b08485c4c2d4d945c6e74fe20f273a38b6104"
},
{
"name": "LangChain",
"version": "0.0.277"
}
]
},
"run": {
"command": "make -C ./applications/holochat_local run_holochat",
"workdir": "holohub_bin"
}
}
}
7 changes: 7 additions & 0 deletions applications/holochat_local/requirements.txt
@@ -0,0 +1,7 @@
langchain==0.0.277
sentence-transformers==2.2.2
GitPython~=3.1
gradio~=3.0
pypdf~=3.12
requests~=2.31
chromadb==0.4.8
