Skip to content

Commit

Permalink
small fixes, dependencies update, option to save chats
Browse files Browse the repository at this point in the history
  • Loading branch information
aosan committed Jun 29, 2024
1 parent 82c66a6 commit e44969e
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 33 deletions.
5 changes: 1 addition & 4 deletions docs_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Load environment variables
default_num_processes = os.getenv('DEFAULT_NUM_PROCESSES')
Expand Down Expand Up @@ -266,9 +266,6 @@ def main():
else:
db.add_documents(batch_texts)

if db is not None:
db.persist()

print(f"Documents are ready! You can now run vaultChat.py to query your model with your private documents")


Expand Down
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ chmod +x docs_loader.py vaultChat.py
# Final instructions for smart and good looking customers
printf "\n >>> Installation was successful!\n"
printf "\n >>> Run './docs_loader.py' to prepare your private data.\n"
# NOTE(review): the next two printf lines overlap — the first (referencing
# $SOURCE_DIRECTORY) is the pre-change text and the second is its replacement;
# only one of them should remain in the final script.
printf "\n >>> Important! Run './docs_loader.py' again every time you change documents in the $SOURCE_DIRECTORY directory.\n"
printf "\n >>> Important! Run './docs_loader.py' again every time you change documents in your directory.\n"
printf "\n >>> Run './vaultChat.py' to start the application after your private data store creation or update.\n"

# That's all, folks
Expand Down
17 changes: 9 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
langchain==0.2.6
gpt4all==2.7.0
chromadb==0.5.3
PyMuPDF==1.24.7
python-dotenv
extract-msg==0.48.5
tabulate==0.9.0
pandoc
pypandoc==1.13
tqdm==4.66.4
sentence_transformers==3.0.1
langchain_community==0.2.6
langchain-huggingface==0.0.3
unstructured
markdown

psutil
82 changes: 62 additions & 20 deletions vaultChat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from dotenv import load_dotenv
from chromadb.config import Settings
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.vectorstores import Chroma
from langchain_community.llms import Ollama
import logging
from datetime import datetime

# Load environment variables from .env file
load_dotenv()
Expand All @@ -23,7 +24,6 @@
ANONYMIZE_TELEMETRY = os.getenv('ANONYMIZE_TELEMETRY', 'True') == 'True'
TARGET_SOURCE_CHUNKS = int(os.getenv('TARGET_SOURCE_CHUNKS', 5))


# Define anonymize telemetry for Chroma DB
client = chromadb.Client(Settings(anonymized_telemetry=ANONYMIZE_TELEMETRY))

Expand Down Expand Up @@ -54,45 +54,87 @@ def main():

def interactive_qa(qa, args, history):
    """Run interactive question and answer sessions with your private data.

    Args:
        qa: the RetrievalQA chain used to answer each query.
        args: parsed command-line arguments (currently unused here; kept for
            interface compatibility with callers).
        history: list that accumulates markdown-formatted Q&A entries, so the
            user can persist them with the '/save <summary_name>' command.
    """
    # Usage instructions (ANSI red-on-white banner).
    print(f"\n\033[31;47m>>> Ready for private chat. Exit the session by typing 'exit' or '/bye'. Save the chat by typing '/save <summary_name>'.\033[0m")
    while True:
        # Normalize the input to handle case-insensitivity.
        query = input("\n\033[31;47m>>> Enter a question: \033[0m").strip().lower()

        if query in ["exit", "/bye"]:
            print("Exiting. Goodbye!")
            break
        elif query.startswith("/save"):
            save_chat_history(history, query)
            continue
        if not query:
            # Ignore empty input rather than sending a blank query to the LLM.
            continue

        try:
            start = time.time()
            # Stream the answer (and source excerpts) as chunks arrive, and
            # let the generator record the exchange into `history`.
            for output in qa_invoke_streaming(qa, query, history):
                print(output, end='', flush=True)
            end = time.time()

            print(f"\n >>> Processing time: {end - start:.2f} seconds")
        except Exception as e:
            # Keep the chat loop alive on failures; log for diagnosis.
            logging.error(f"Error processing query: {e}")

def print_answer(result, query, args, start, end):
    """Echo the model's answer, optionally its source excerpts, and the elapsed time.

    Args:
        result: QA chain output dict with 'result' and (unless hidden)
            'source_documents'.
        query: the question the user asked.
        args: parsed CLI arguments; `args.hide_source` suppresses excerpts.
        start: wall-clock time (seconds) when processing began.
        end: wall-clock time (seconds) when processing finished.
    """
    answer = result['result']
    if args.hide_source:
        docs = []
    else:
        docs = result['source_documents']

    print(f"\n\n> Question: {query}\n{answer}")

    # ANSI escapes: highlight only the source-path line, then reset.
    highlight = "\033[94m"  # Bright blue color
    reset = "\033[0m"       # Resets the color to default
    for doc in docs:
        header = f"{highlight}> {doc.metadata['source']}{reset}"
        print(f"\n{header}:\n{doc.page_content}")

    print(f"\n >>> Processing time: {end - start:.2f} seconds")
def qa_invoke_streaming(qa, query, history):
    """Run one QA round, yielding printable chunks and recording the exchange.

    Appends a markdown-formatted entry for this question/answer to *history*
    (source excerpts are folded into the entry once all of them have been
    yielded). Yields the answer first, then one chunk per source document.
    """
    result = qa.invoke(query)
    answer = result['result']
    history.append(f"### Question: {query}\n{answer}\n")

    yield f"\n\n> Question: {query}\n{answer}"

    if 'source_documents' not in result:
        return

    blue = "\033[94m"   # Bright blue color
    reset = "\033[0m"   # Resets the color to default
    collected = []
    for doc in result['source_documents']:
        chunk = f"{blue}> {doc.metadata['source']}{reset}:\n{doc.page_content}"
        collected.append(f"\n{chunk}")
        yield f"\n{chunk}"
    # Fold the sources into the history entry we appended above.
    history[-1] += "".join(collected)

# (Earlier draft of save_chat_history removed: it wrote the markdown file into
# the current directory instead of the 'chats_history' folder. See the live
# implementation below.)


def save_chat_history(history, query):
    """Save the chat history to a markdown file in the 'chats_history' folder.

    Args:
        history: list of markdown-formatted Q&A entries accumulated this session.
        query: the raw '/save <summary_name>' command; the summary name (spaces
            replaced by underscores, lowercased) becomes part of the file name.

    Prints the saved path on success, or a usage hint when no summary name
    was supplied.
    """
    try:
        # Create 'chats_history' folder if non-existent
        os.makedirs("chats_history", exist_ok=True)

        # Raises ValueError when the command has no summary name.
        _, summary_name = query.split(maxsplit=1)
        timestamp = datetime.now().strftime("%y%m%d%H%M")
        file_name = f"{timestamp}_{summary_name.replace(' ', '_').lower()}.md"
        file_path = os.path.join("chats_history", file_name)

        # Explicit UTF-8 so non-ASCII answers are written correctly
        # regardless of the platform's default locale encoding.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("# Chat History\n\n")
            f.writelines(history)

        print(f"Chat history saved as {file_path}")
    except ValueError:
        print("Invalid command. Use /save <summary_name>")


def parse_arguments():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with `hide_source` and `streaming` booleans.
    """
    parser = argparse.ArgumentParser(description='VaultChat: Ask questions about your documents via a LLM.')
    parser.add_argument("--hide-source", "-S", action='store_true', help='Disable printing of source documents used for answers.')
    # NOTE: the scraped diff showed --streaming registered twice (old and new
    # help text); argparse raises on duplicate option strings, so only the
    # updated registration is kept.
    parser.add_argument("--streaming", action='store_true', help='Enable the streaming from LLMs.')
    return parser.parse_args()

if __name__ == "__main__":
Expand Down

0 comments on commit e44969e

Please sign in to comment.