Correct prompt string, update intro, add comments
NigelNelson committed Sep 6, 2023
1 parent 87fe836 commit dfa6c54
Showing 8 changed files with 132 additions and 37 deletions.
16 changes: 14 additions & 2 deletions applications/holochat_local/Makefile
@@ -13,13 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Target that runs all commands in order to run HoloChat
.PHONY: run_holochat
run_holochat: build_llamaCpp build_db download_llama start_holochat

# Creates container used for HoloChat and compiles Llama.cpp
.PHONY: build_llamaCpp
build_llamaCpp:
docker build --ulimit memlock=-1 --ulimit stack=67108864 -t holochat .

# Creates the vector database used by HoloChat
.PHONY: build_db
build_db:
mkdir -p holochat/embeddings
@@ -36,20 +39,29 @@ build_db:
holochat \
python3 build_holoscan_db.py \

# Downloads the Code Llama model used by HoloChat
.PHONY: download_llama
download_llama:
mkdir -p holochat/docs
wget -nc -P ./holochat/models https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v2-GGUF/resolve/main/phind-codellama-34b-v2.Q5_K_M.gguf

# Runs HoloChat inside the PyTorch container
.PHONY: start_holochat
start_holochat:
docker run --rm -it \
-p 7860:7860 \
-p 8080:8080 \
--gpus all \
--ipc=host --ulimit memlock=-1 \
--ipc=host \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v ./holochat:/holochat \
-w /holochat \
holochat \
bash -c "/workspace/llama.cpp/build/bin/server -m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf --host 0.0.0.0 -ngl 1000 -c 4096 --alias llama_2 & python3 -u chatbot.py"
bash -c "/workspace/llama.cpp/build/bin/server \
-m /holochat/models/phind-codellama-34b-v2.Q5_K_M.gguf \
--host 0.0.0.0 \
-ngl 1000 \
-c 4096 \
--alias llama_2 \
& python3 -u chatbot.py"
2 changes: 1 addition & 1 deletion applications/holochat_local/README.md
@@ -1,6 +1,6 @@
# HoloChat-local

HoloChat-local is an AI-driven chatbot, built on top of a locally hosted Code Llama model, which acts as a developer's copilot for Holoscan development. The Code Llama model leverages a vector database built from the Holoscan SDK repository and user guide, enabling HoloChat to answer general questions about Holoscan as well as act as a Holoscan SDK coding assistant.
<p align="center">
<kbd style="border: 2px solid black;">
<img src="holochat_demo.gif" alt="HoloChat Demo" />
5 changes: 5 additions & 0 deletions applications/holochat_local/holochat/build_holoscan_db.py
@@ -35,6 +35,8 @@
def main():
content_lists = {file_type: [] for file_type in file_types}
total_files = 0

# Loop over each repo and create a Document for each file found
for repo in repos:
clone_repository(repo, "")
for file_type in file_types:
@@ -53,6 +55,7 @@ def main():
)
)

# Loop over the user guide and create a Document for each page
content_lists[".pdf"] = []
for doc in docs:
loader = PyPDFLoader(doc)
@@ -77,6 +80,7 @@ def main():
Document(page_content=page_content, metadata={"userguide": doc})
)

# Dictionary used to map file type to language
ext_to_language = {
".py": "python",
".cpp": "cpp",
@@ -95,6 +99,7 @@ def main():
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True} # set True to compute cosine similarity

# Create local embedding model cached at ./models
embedding_model = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
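
A minimal sketch (not part of the commit) of the vector-database build this file performs, assuming the langchain 0.0.277 API pinned in requirements.txt. The stand-in document contents, the `BAAI/bge-large-en` model name, and the persist path are illustrative assumptions; only the embedding kwargs mirror the diff above.

```python
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

# Hypothetical stand-in documents; build_holoscan_db.py creates these from
# cloned repo files and user-guide PDF pages
docs = [
    Document(page_content="Holoscan operators implement compute().", metadata={"userguide": "holoscan.pdf"}),
    Document(page_content="def compose(self): ...", metadata={"source": "app.py"}),
]

# Mirrors the embedding setup in the diff: BGE embeddings cached under ./models
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en",  # assumed; the actual model_name is outside this hunk
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # cosine similarity
    cache_folder="./models",
)

# With chromadb 0.4.x, writes under persist_directory are persisted automatically
db = Chroma.from_documents(docs, embedding_model, persist_directory="./embeddings/chroma_db")
```
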
15 changes: 7 additions & 8 deletions applications/holochat_local/holochat/chatbot.py
@@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Simple Gradio chatbot app; for details, visit:
# https://www.gradio.app/guides/creating-a-custom-chatbot-with-blocks

import gradio as gr
import sklearn
from llm import LLM
@@ -21,11 +24,6 @@

initial_prompt = "Welcome to HoloChat! How can I assist you today?"

python_prompt = """Create a Python Holoscan 'hello world' app with video
as input, use HoloViz to print 'Hello World' on each frame, and then output
it to the user. After the code explain the process step-by-step."""


def ask_question(message, chat_history):
if chat_history is None:
return "", [[None, initial_prompt]]
@@ -39,9 +37,7 @@ def ask_question(message, chat_history):
def stream_response(chat_history, llm):
if llm is None:
llm = LLM()

response = llm.answer_question(chat_history)

for chunk in response:
yield chunk, llm

@@ -101,7 +97,10 @@ def main():
["What operating system can I use with the Holoscan SDK?"],
["What hardware does Holoscan support?"],
["How do I create a C++ Holoscan Operator?"],
[python_prompt],
["Create a Python Holoscan 'hello world' app with video " \
"as input, use HoloViz to print 'Hello World' on each frame, " \
"and then output it to the user. After the code explain the " \
"process step-by-step."],
],
inputs=tbInput,
)
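
chatbot.py follows the Blocks pattern from the guide linked at the top of the file. A minimal sketch, assuming Gradio 3.x as pinned in requirements.txt, with a placeholder echo standing in for HoloChat's LLM call:

```python
import gradio as gr

def respond(message, chat_history):
    # Placeholder reply; HoloChat streams tokens from the LLM instead
    chat_history = chat_history or []
    chat_history.append((message, f"You said: {message}"))
    return "", chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    # Submitting the textbox clears it and appends the new message pair
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch(server_name="0.0.0.0", server_port=7860)  # same port the Makefile exposes
```
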
71 changes: 46 additions & 25 deletions applications/holochat_local/holochat/llm.py
@@ -22,25 +22,25 @@
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

# Llama-2 has context length of 4096 token
# 1 token = ~4 characters, so 3500 * 4 provides plenty of room.
# Most Llama-2 models are trained with a context length of 4096 tokens.
# 1 token = ~4 characters, so 3300 * 4 provides plenty of room.
MAX_TOKENS = 3300 * 4
# Empirically found to be the cutoff of specific questions vs. generic comments about previous answer
# Empirically found to be the cutoff between specific questions and generic comments about the previous answer
# This ensures no documents are returned for comments such as "Rewrite that code in one block"
SEARCH_THRESHOLD = 0.35
NUM_HOLOSCAN_DOCS = 7
LLAMA_SERVER = "http://127.0.0.1:8080"
SERVER_TIMEOUT = 60 # seconds
SERVER_TIMEOUT = 60 # Timeout in seconds to connect to llama.cpp

system_prompt = """You are NVIDIA-GPT, an expert at all things NVIDIA who knows
the Holoscan user guide, as well as examples from Holohub and the api from the SDK.
You are an assistant who answers questions step-by-step and always provides your
reasoning so you have the correct result. Answer the questions based on the provided
context and augment with your general knowledge where appropriate. Reformat the provided
code examples as necessary since they were retrieved with a web scrape.
Under no circumstances will you make up Holoscan API functions or functionality that does not exist!
Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response with '</s>'.
Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:
"""
system_prompt = "You are NVIDIA-GPT, an expert at all things NVIDIA who knows " \
"the Holoscan user guide, as well as examples from Holohub and the api from the SDK. " \
"You are an assistant who answers questions step-by-step and always provides your " \
"reasoning so you have the correct result. Answer the questions based on the provided " \
"context and augment with your general knowledge where appropriate. Reformat the provided " \
"code examples as necessary since they were retrieved with a web scrape. " \
"Under no circumstances will you make up Holoscan API functions or functionality that does not " \
"exist! Do not conflate Holoscan Python API with Holoscan C++ API. You ALWAYS end your response " \
"with '</s>'. Below is NVIDIA Holoscan SDK documentation to assist you in answering questions:"


class LLM:
@@ -54,7 +54,7 @@ def answer_question(self, chat_history):
docs = self.db.similarity_search_with_score(
query=question, k=NUM_HOLOSCAN_DOCS, distance_metric="cos"
)
# Filter out poor matches
# Filter out poor matches from the vector db
docs = list(
map(lambda lc_doc: lc_doc[0], filter(lambda lc_doc: lc_doc[1] < SEARCH_THRESHOLD, docs))
)
@@ -65,13 +65,15 @@
] # Get first docs (highest similarity score)
self.prev_docs = docs # Save document list

# Create a prompt to send to the LLM (excludes the initial greeting and the current question)
llama_prompt = _to_llama_prompt(chat_history[1:-1], question, docs)
response = self._stream_ai_response(llama_prompt, chat_history)

for chunk in response:
yield chunk

def _stream_ai_response(self, llama_prompt, chat_history):
# Llama-specific request data
request_data = {
"prompt": llama_prompt,
"temperature": 0,
@@ -99,34 +101,45 @@ def _get_database(self):
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True} # set True to compute cosine similarity

# Construct embedding model and cache to local './models' dir
embedding_model = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs,
cache_folder="./models",
)
# Load the persisted Chroma vector db from CHROMA_DB_PATH
chroma_db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)

return chroma_db


def _to_llama_prompt(history, question, docs):
"""An attempt to mirror Alpaca-style prompting as closely as possible: https://github.com/arielnlee/Platypus/blob/main/templates/alpaca.json"""
"""
Takes the chat history, the current question, and the documents from the
vector db, and creates a single prompt string for the Llama model
"""

# Phind v2's prompt prefixes (Note these are dependent on the model used)
user_prefix = "### User Message:"
bot_prefix = "### Assistant:"
bot_rule_prefix = "### System Prompt:"

opening_prompt = f"""Below is a chat between a user '{user_prefix}', and you, the AI
assistant '{bot_prefix}'. You follow the given rule '{bot_rule_prefix}' no matter what."""
# Explain the context of the information being provided
opening_prompt = f"Below is a chat between a user '{user_prefix}', and you, " \
f"the AI assistant '{bot_prefix}'. You follow the given rule " \
f"'{bot_rule_prefix}' no matter what."

# Combine all the vector db docs into a single string
docs = "\n\n".join(list(map(lambda lc_doc: lc_doc.page_content, docs)))
# Add the system prompt with the vector db docs
opening_prompt += f"\n\n{bot_rule_prefix}\n{system_prompt}\n\n{docs}"
ending_prompt = f"""\n\n{user_prefix}\nUsing the previous conversation history,
the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer
the following question (include markdown code snippets for coding questions and do not acknowledge
that documentation was provided to you):\n{question}"""
# Define the final portion of the prompt
ending_prompt = f"\n\n{user_prefix}\nUsing the previous conversation history, " \
"the provided NVIDIA Holoscan SDK documentation, AND your own expert knowledge, answer " \
"the following question (include markdown code snippets for coding questions and do not " \
f"acknowledge that documentation was provided to you):\n{question}"

# Loop over the chat history and convert it to a single string
msg_hist = ""
for msg_pair in history:
if msg_pair[0]:
@@ -136,23 +149,31 @@ def _to_llama_prompt(history, question, docs):

len_prompt = len(msg_hist) + len(opening_prompt) + len(ending_prompt)

# Remove previous conversation history if MAX_TOKENS exceeded
# Truncate previous conversation history if MAX_TOKENS is exceeded
if len_prompt > MAX_TOKENS:
excess_tokens = len_prompt - MAX_TOKENS
msg_hist = msg_hist[excess_tokens:]
last_msg_idx = msg_hist.find("\n\n" + user_prefix)
bot_idx = msg_hist.find("\n\n" + bot_prefix)
# Truncate to the last user or bot message, whichever allows for a
# longer chat history
if bot_idx < last_msg_idx:
last_msg_idx = bot_idx
msg_hist = msg_hist[last_msg_idx:]

# Create the final prompt
prompt = opening_prompt + msg_hist + ending_prompt + f"\n\n{bot_prefix}\n"
print(prompt)
return prompt


def _wait_for_server():
"""
Attempts to connect to the llama.cpp server
for up to SERVER_TIMEOUT seconds before raising an exception
"""
attempts = 0
while attempts < SEARCH_THRESHOLD / 5:
while attempts < SERVER_TIMEOUT / 5:
try:
response = requests.get(LLAMA_SERVER)
# Check for a successful response status code (e.g., 200 OK)
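
`_stream_ai_response` posts `request_data` to the llama.cpp server that the Makefile launches on port 8080. A minimal sketch of that interaction, assuming the `/completion` endpoint and SSE-style `data:` lines of llama.cpp's server example; the exact response fields are assumptions that may vary between llama.cpp revisions:

```python
import json
import requests

LLAMA_SERVER = "http://127.0.0.1:8080"

# A trimmed-down request; llm.py also sets sampling and stop parameters
request_data = {
    "prompt": "### User Message:\nHello\n\n### Assistant:\n",
    "temperature": 0,
    "stream": True,
}

with requests.post(f"{LLAMA_SERVER}/completion", json=request_data, stream=True) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        text = line.decode("utf-8")
        if text.startswith("data: "):  # strip the SSE prefix
            text = text[len("data: "):]
        payload = json.loads(text)
        print(payload.get("content", ""), end="", flush=True)
        if payload.get("stop"):  # server signals the end of generation
            break
```
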
12 changes: 11 additions & 1 deletion applications/holochat_local/holochat/utils.py
@@ -25,6 +25,9 @@


def clone_repository(repo, token):
"""
Used to clone nvidia-holoscan repos
"""
print(f"Cloning repository: {repo}")
time.sleep(1)
try:
@@ -39,6 +42,9 @@ def clone_repository(repo, token):


def clone_general_repository(repo, token):
"""
Used to clone general repos
"""
print(f"Cloning repository: {repo}")
time.sleep(1)
try:
@@ -81,8 +87,12 @@ def get_files(files, type):
return contents


# langchain method can't handle 'disallowed_special' - use tiktoken for now
def get_source_chunks(all_contents, file_type=None, chunk_size=1500, chunk_overlap=150):
"""
Splits Documents into chunks for storage. If the language is supported,
the text is split according to the syntax of that language (e.g., not
splitting Python functions in the middle)
"""
if file_type in ["python", "cpp", "markdown"]:
splitter = RecursiveCharacterTextSplitter.from_language(
language=file_type, chunk_size=chunk_size, chunk_overlap=chunk_overlap
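
`get_source_chunks` defers to LangChain's language-aware splitter for supported file types. A minimal sketch, assuming langchain 0.0.277, showing how `Language.PYTHON` prefers syntactic boundaries (such as `def`) over raw character cuts:

```python
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

code = "def foo():\n    return 1\n\ndef bar():\n    return 2\n"

# Small chunk_size forces a split; the splitter breaks at the "\ndef " boundary
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
for chunk in splitter.split_text(code):
    print(repr(chunk))
```
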
41 changes: 41 additions & 0 deletions applications/holochat_local/metadata.json
@@ -0,0 +1,41 @@
{
"application": {
"name": "HoloChat-local",
"authors": [
{
"name": "Nigel Nelson",
"affiliation": "NVIDIA"
}
],
"language": "Python",
"version": "0.1.0",
"changelog": {
"0.1.0": "Beta release"
},
"holoscan_sdk": {
"minimum_required_version": "0.6.0",
"tested_versions": [
"0.6.0"
]
},
"platforms": ["amd64", "arm64"],
"tags": ["LLM", "Vector Database", "AI-Assistant"],
"ranking": 4,
"dependencies": {
"OSS": [
{
"name": "Llama.cpp",
"version": "cf9b08485c4c2d4d945c6e74fe20f273a38b6104"
},
{
"name": "LangChain",
"version": "0.0.277"
}
]
},
"run": {
"command": "make -C ./applications/holochat_local run_holochat",
"workdir": "holohub_bin"
}
}
}
7 changes: 7 additions & 0 deletions applications/holochat_local/requirements.txt
@@ -0,0 +1,7 @@
langchain==0.0.277
sentence-transformers==2.2.2
GitPython~=3.1
gradio~=3.0
pypdf~=3.12
requests~=2.31
chromadb==0.4.8
