From 8fe8f06a01971208f4f43e7f593b4a43674a617b Mon Sep 17 00:00:00 2001 From: Agnieszka Forajter <83956577+agnkuz@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:02:46 +0200 Subject: [PATCH] Update prompt --- main_rag.ipynb | 1811 +++++++++++++++++++++++++----------------------- 1 file changed, 932 insertions(+), 879 deletions(-) diff --git a/main_rag.ipynb b/main_rag.ipynb index dcac9b7..8ccc83c 100644 --- a/main_rag.ipynb +++ b/main_rag.ipynb @@ -1,889 +1,942 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": "# Wire RAG \"Open", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "id": "780d3dac3fa3a702" - }, - { - "cell_type": "markdown", - "source": [ - "## Setup" - ], - "metadata": { - "collapsed": false - }, - "id": "7d3909fd99fe64ed" - }, - { - "cell_type": "markdown", - "source": "Install dependencies for colab", - "metadata": { - "collapsed": false - }, - "id": "b4ef237387298e05" - }, - { - "cell_type": "code", - "execution_count": null, - "id": "L5aAdbvCNIME", - "metadata": { - "id": "L5aAdbvCNIME" - }, - "outputs": [], - "source": [ - "!pip install haystack-ai pinecone-haystack sentence-transformers pinecone transformers\n", - "!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/url_scraper.py\n", - "!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/github_scraper.py" - ] - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "Enter api keys", - "id": "d2ec40ad9144d8d6" - }, - { - "cell_type": "code", - "outputs": [], - "source": [ - "import os\n", - "import getpass\n", - "os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"pinecone api key\")\n", - "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"open ai api key\")\n", - "os.environ[\"GITHUB_API_TOKEN\"] = getpass.getpass(\"github api token (PAT)\")" - ], - "metadata": { - "id": "hgvsByZlcsN7", - "ExecuteTime": { - "end_time": "2024-09-23T08:16:51.569319Z", - "start_time": "2024-09-23T08:16:49.423772Z" 
- } - }, - "id": "hgvsByZlcsN7", - "execution_count": 6 - }, - { - "cell_type": "markdown", - "source": [ - "Or load keys from .env file" - ], - "metadata": { - "collapsed": false - }, - "id": "e56187b91fd48f4c" - }, - { - "cell_type": "code", - "source": [ - "from dotenv import load_dotenv\n", - "import os\n", - "load_dotenv()" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-10-01T09:27:34.727106Z", - "start_time": "2024-10-01T09:27:34.715223Z" - } - }, - "id": "c4ff7e3293da16da", - "outputs": [ + "cells": [ { - "data": { - "text/plain": [ - "True" + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "id": "b36f5fc7", - "metadata": { - "id": "b36f5fc7" - }, - "source": [ - "## Populate Pinecone Database" - ] - }, - { - "cell_type": "markdown", - "source": "### Scrape URLs\n", - "metadata": { - "collapsed": false - }, - "id": "f9debb5e3cf24851" - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "Whitelist: Allow any URL that begins with any element from the white_list.\\\n", - "Blacklist: Block any URL that begins with any element from the black_list." 
- ], - "id": "507b685542fead85" - }, - { - "metadata": { - "id": "393ea1e7", - "ExecuteTime": { - "end_time": "2024-10-01T08:00:34.188137Z", - "start_time": "2024-10-01T08:00:13.067599Z" - } - }, - "cell_type": "code", - "source": [ - "import nest_asyncio\n", - "from utils.url_scraper import start_scraping\n", - "\n", - "# Apply the nest_asyncio patch to allow nested event loops in Jupyter\n", - "nest_asyncio.apply()\n", - "\n", - "starting_url = \"https://docs.wire.com\"\n", - "depth_limit = 2\n", - "\n", - "filter_list = {\"white_list\": [\"https://docs.wire.com\"], \"black_list\": []}\n", - "scraped_urls = await start_scraping(starting_url, depth_limit, filter_list)\n", - "\n", - "print(f\"Total URLs found: {len(scraped_urls)}\")" - ], - "id": "393ea1e7", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total URLs found: 429\n" - ] - } - ], - "execution_count": 2 - }, - { - "cell_type": "markdown", - "id": "b5a20874", - "metadata": { - "id": "b5a20874" - }, - "source": [ - "### Extract metadata and content" - ] - }, - { - "cell_type": "code", - "id": "8005a467", - "metadata": { - "id": "8005a467", - "ExecuteTime": { - "end_time": "2024-10-01T08:01:48.072118Z", - "start_time": "2024-10-01T08:00:43.421241Z" - } - }, - "source": [ - "from utils.url_scraper import extract_content_and_metadata, DATE_FORMATS, DATE_PATTERNS\n", - "\n", - "scraped_urls_dict = []\n", - "for u in scraped_urls:\n", - " scraped_urls_dict += extract_content_and_metadata(u, DATE_FORMATS, DATE_PATTERNS)\n", - "print(len(scraped_urls_dict))" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6679\n" - ] - } - ], - "execution_count": 3 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Scrape Github", - "id": "65d676deb188d49c" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "from utils.github_scraper import scrape_md_files\n", - "\n", - "md_dict = await scrape_md_files(org_name=\"wireapp\", 
api_key=os.getenv(\"GITHUB_API_TOKEN\"), repo_limit=None)\n", - "print(len(md_dict))" - ], - "id": "1892ddc2fc1323b5", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Save / Load .json", - "id": "48d1db0dbee9deab" - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-01T09:27:42.316090Z", - "start_time": "2024-10-01T09:27:42.153336Z" - } - }, - "cell_type": "code", - "source": [ - "import json\n", - "\n", - "# with open(\"./github_docs.json\", \"w\", encoding='utf-8') as json_file:\n", - "# json.dump(md_dict, json_file, ensure_ascii=False, indent=4)\n", - "# \n", - "# with open(\"./docs_wire.json\", \"w\", encoding='utf-8') as json_file:\n", - "# json.dump(scraped_urls_dict, json_file, ensure_ascii=False, indent=4)\n", - " \n", - "with open(\"./github_docs.json\", 'r', encoding='utf-8') as json_file:\n", - " md_dict = json.load(json_file)\n", - "print(len(md_dict), md_dict[0]['metadata'], sep='\\n')\n", - "\n", - "with open(\"./docs_wire.json\", 'r', encoding='utf-8') as json_file:\n", - " scraped_urls_dict = json.load(json_file)\n", - "print(len(scraped_urls_dict), scraped_urls_dict[0]['metadata'], sep='\\n')" - ], - "id": "be188a477f9defa1", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1366\n", - "{'url': 'https://github.com/wireapp/libsodium.js/blob/master/README.md', 'title': 'libsodium.js/README.md', 'headline': '', 'date': '2015-10-07'}\n", - "6679\n", - "{'url': 'https://docs.wire.com', 'title': 'Welcome to Wire’s documentation! — Wire 0.0.4 documentation', 'headline': 'Welcome to Wire’s documentation!\\uf0c1', 'date': 'Unknown'}\n" - ] - } - ], - "execution_count": 2 - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Populate database", - "id": "8e4fd58bcace53ee" - }, - { - "cell_type": "markdown", - "source": [ - "To delete all records u need to `pip install \"pinecone[grpc]\"` and run the following code." 
- ], - "metadata": { - "collapsed": false - }, - "id": "17ee68b0c460d0d6" - }, - { - "cell_type": "code", - "source": [ - "# Uncomment to delete all db records\n", - "# import os\n", - "# from pinecone import Pinecone\n", - "# Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\")).Index(\"wire-rag\").delete(delete_all=True, namespace='docs-wire')" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-10-01T09:27:23.635153Z", - "start_time": "2024-10-01T09:27:23.054147Z" - } - }, - "id": "eaa67c386a4cf554", - "outputs": [ + }, { - "data": { - "text/plain": [ - "{}" + "cell_type": "markdown", + "source": [ + "## Setup" + ], + "metadata": { + "collapsed": false, + "id": "7d3909fd99fe64ed" + }, + "id": "7d3909fd99fe64ed" + }, + { + "cell_type": "markdown", + "source": [ + "Install dependencies for colab" + ], + "metadata": { + "collapsed": false, + "id": "b4ef237387298e05" + }, + "id": "b4ef237387298e05" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "L5aAdbvCNIME", + "metadata": { + "id": "L5aAdbvCNIME" + }, + "outputs": [], + "source": [ + "!pip install haystack-ai pinecone-haystack sentence-transformers pinecone transformers\n", + "!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/url_scraper.py\n", + "!wget -P utils https://raw.githubusercontent.com/appunite/Wire-RAG/main/utils/github_scraper.py" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "cell_type": "markdown", - "source": [ - "Initialize Pinecone Document Store" - ], - "metadata": { - "collapsed": false - }, - "id": "e757de5fa7764edb" - }, - { - "cell_type": "code", - "id": "5142cfb8161abc1", - "metadata": { - "id": "5142cfb8161abc1", - "ExecuteTime": { - "end_time": "2024-10-01T09:28:11.252209Z", - "start_time": "2024-10-01T09:28:09.397726Z" - } - }, - "source": [ - "from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner\n", - "from 
haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", - "from haystack.components.writers import DocumentWriter\n", - "from haystack import Pipeline\n", - "from haystack import Document\n", - "from haystack_integrations.document_stores.pinecone import PineconeDocumentStore\n", - "\n", - "docs_wire_ds = PineconeDocumentStore(\n", - " index=\"wire-rag\",\n", - " namespace=\"docs-wire\",\n", - " dimension=384,\n", - " metric=\"cosine\",\n", - " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", - ")\n", - "\n", - "github_wireapp_ds = PineconeDocumentStore(\n", - " index=\"wire-rag\",\n", - " namespace=\"github-wireapp\",\n", - " dimension=384,\n", - " metric=\"cosine\",\n", - " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", - ")\n", - "\n", - "scraped_urls_documents = [Document(content=doc[\"content\"], meta=doc[\"metadata\"]) for doc in scraped_urls_dict]\n", - "print(f\"Scraped URLs documents: {len(scraped_urls_documents)}\")\n", - "\n", - "github_documents = [Document(content=doc[\"content\"], meta=doc[\"metadata\"]) for doc in md_dict]\n", - "print(f\"Github documents: {len(github_documents)}\")" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scraped URLs documents: 6679\n", - "Github documents: 1366\n" - ] - } - ], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "id": "3bfe34af87648a2a", - "metadata": { - "collapsed": false, - "id": "3bfe34af87648a2a" - }, - "source": "Create a pipelines to populate the Pinecone Document Store with both github and docs.wire documetns" - }, - { - "cell_type": "code", - "id": "f0df56147ec0fad8", - "metadata": { - "id": "f0df56147ec0fad8", - "ExecuteTime": { - "end_time": "2024-10-01T09:28:16.626519Z", - "start_time": "2024-10-01T09:28:16.620539Z" - } - }, - "source": [ - "# For all-MiniLM-L6-v2 default input text is 256 word pieces.\n", - "splitter_gh = DocumentSplitter(split_by=\"word\", split_length=256, 
split_overlap=20)\n", - "embedder_gh = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", - "writer_gh = DocumentWriter(github_wireapp_ds)\n", - "\n", - "pipeline_github = Pipeline()\n", - "pipeline_github.add_component(instance=splitter_gh, name=\"splitter_gh\")\n", - "pipeline_github.add_component(instance=embedder_gh, name=\"embedder_gh\")\n", - "pipeline_github.add_component(instance=writer_gh, name=\"writer_gh\")\n", - "\n", - "pipeline_github.connect(\"splitter_gh\", \"embedder_gh\")\n", - "pipeline_github.connect(\"embedder_gh\", \"writer_gh\")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "🚅 Components\n", - " - splitter_gh: DocumentSplitter\n", - " - embedder_gh: SentenceTransformersDocumentEmbedder\n", - " - writer_gh: DocumentWriter\n", - "🛤️ Connections\n", - " - splitter_gh.documents -> embedder_gh.documents (List[Document])\n", - " - embedder_gh.documents -> writer_gh.documents (List[Document])" + }, + { + "metadata": { + "id": "d2ec40ad9144d8d6" + }, + "cell_type": "markdown", + "source": [ + "Enter api keys" + ], + "id": "d2ec40ad9144d8d6" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"pinecone api key\")\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"open ai api key\")\n", + "os.environ[\"GITHUB_API_TOKEN\"] = getpass.getpass(\"github api token (PAT)\")" + ], + "metadata": { + "id": "hgvsByZlcsN7", + "ExecuteTime": { + "end_time": "2024-09-23T08:16:51.569319Z", + "start_time": "2024-09-23T08:16:49.423772Z" + } + }, + "id": "hgvsByZlcsN7", + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Or load keys from .env file" + ], + "metadata": { + "collapsed": false, + "id": "e56187b91fd48f4c" + }, + "id": "e56187b91fd48f4c" + }, + { + "cell_type": "code", + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "load_dotenv()" + ], + 
"metadata": { + "ExecuteTime": { + "end_time": "2024-10-01T09:27:34.727106Z", + "start_time": "2024-10-01T09:27:34.715223Z" + }, + "id": "c4ff7e3293da16da", + "outputId": "46fd518b-015c-433f-a2a8-ef84767e2ea8" + }, + "id": "c4ff7e3293da16da", + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "b36f5fc7", + "metadata": { + "id": "b36f5fc7" + }, + "source": [ + "## Populate Pinecone Database" ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 4 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-01T09:28:30.574364Z", - "start_time": "2024-10-01T09:28:30.565207Z" - } - }, - "cell_type": "code", - "source": [ - "cleaner_scraped = DocumentCleaner()\n", - "# For all-MiniLM-L6-v2 default input text is 256 word pieces.\n", - "splitter_scraped = DocumentSplitter(split_by=\"word\", split_length=256, split_overlap=20)\n", - "embedder_scraped = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", - "writer_scraped = DocumentWriter(docs_wire_ds)\n", - "\n", - "pipeline_scraped = Pipeline()\n", - "pipeline_scraped.add_component(instance=cleaner_scraped, name=\"cleaner_scraped\")\n", - "pipeline_scraped.add_component(instance=splitter_scraped, name=\"splitter_scraped\")\n", - "pipeline_scraped.add_component(instance=embedder_scraped, name=\"embedder_scraped\")\n", - "pipeline_scraped.add_component(instance=writer_scraped, name=\"writer_scraped\")\n", - "\n", - "pipeline_scraped.connect(\"cleaner_scraped\", \"splitter_scraped\")\n", - "pipeline_scraped.connect(\"splitter_scraped\", \"embedder_scraped\")\n", - "pipeline_scraped.connect(\"embedder_scraped\", \"writer_scraped\")" - ], - "id": "21543dec4ca93b9c", - "outputs": [ - { - "data": { - "text/plain": [ - "\n", - "🚅 Components\n", - " - 
cleaner_scraped: DocumentCleaner\n", - " - splitter_scraped: DocumentSplitter\n", - " - embedder_scraped: SentenceTransformersDocumentEmbedder\n", - " - writer_scraped: DocumentWriter\n", - "🛤️ Connections\n", - " - cleaner_scraped.documents -> splitter_scraped.documents (List[Document])\n", - " - splitter_scraped.documents -> embedder_scraped.documents (List[Document])\n", - " - embedder_scraped.documents -> writer_scraped.documents (List[Document])" + }, + { + "cell_type": "markdown", + "source": [ + "### Scrape URLs\n" + ], + "metadata": { + "collapsed": false, + "id": "f9debb5e3cf24851" + }, + "id": "f9debb5e3cf24851" + }, + { + "metadata": { + "id": "507b685542fead85" + }, + "cell_type": "markdown", + "source": [ + "Whitelist: Allow any URL that begins with any element from the white_list.\\\n", + "Blacklist: Block any URL that begins with any element from the black_list." + ], + "id": "507b685542fead85" + }, + { + "metadata": { + "id": "393ea1e7", + "ExecuteTime": { + "end_time": "2024-10-01T08:00:34.188137Z", + "start_time": "2024-10-01T08:00:13.067599Z" + }, + "outputId": "2a9c9155-bf24-472a-ed23-4cdadc70e30c" + }, + "cell_type": "code", + "source": [ + "import nest_asyncio\n", + "from utils.url_scraper import start_scraping\n", + "\n", + "# Apply the nest_asyncio patch to allow nested event loops in Jupyter\n", + "nest_asyncio.apply()\n", + "\n", + "starting_url = \"https://docs.wire.com\"\n", + "depth_limit = 2\n", + "\n", + "filter_list = {\"white_list\": [\"https://docs.wire.com\"], \"black_list\": []}\n", + "scraped_urls = await start_scraping(starting_url, depth_limit, filter_list)\n", + "\n", + "print(f\"Total URLs found: {len(scraped_urls)}\")" + ], + "id": "393ea1e7", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total URLs found: 429\n" + ] + } + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "b5a20874", + "metadata": { + "id": "b5a20874" + }, + "source": [ + "### Extract metadata and 
content" ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 5 - }, - { - "cell_type": "markdown", - "id": "c8fddbeb5665f26", - "metadata": { - "collapsed": false, - "id": "c8fddbeb5665f26" - }, - "source": [ - "Run the pipeline" - ] - }, - { - "cell_type": "code", - "id": "7a1c47cb85342d86", - "metadata": { - "id": "7a1c47cb85342d86" - }, - "source": [ - "pipeline_github.run(data = {\"splitter_gh\": { \"documents\" : github_documents }})\n", - "pipeline_scraped.run(data = {\"cleaner_scraped\": { \"documents\" : scraped_urls_documents }})\n", - "# preprocessing_pipeline.show()" - ], - "outputs": [], - "execution_count": null - }, - { - "cell_type": "markdown", - "id": "9eae0c4f1384253e", - "metadata": { - "collapsed": false, - "id": "9eae0c4f1384253e" - }, - "source": [ - "## Test RAG with Pinecone Document Store" - ] - }, - { - "cell_type": "markdown", - "id": "75a635b2d20f0142", - "metadata": { - "collapsed": false, - "id": "75a635b2d20f0142" - }, - "source": [ - "Restart the kernel and run the following code to test the RAG pipeline with the populated Pinecone Document Store.\\\n", - "Create pipeline to run a query" - ] - }, - { - "cell_type": "code", - "id": "1b38c8ef5aacfe0", - "metadata": { - "id": "1b38c8ef5aacfe0", - "ExecuteTime": { - "end_time": "2024-10-01T10:52:05.586737Z", - "start_time": "2024-10-01T10:52:03.820128Z" - } - }, - "source": [ - "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", - "from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever\n", - "from haystack.components.generators import OpenAIGenerator\n", - "from haystack.components.builders.answer_builder import AnswerBuilder\n", - "from haystack.components.builders.prompt_builder import PromptBuilder\n", - "from haystack.components.joiners.document_joiner import DocumentJoiner\n", - "from haystack import Pipeline\n", - "from 
haystack_integrations.document_stores.pinecone import PineconeDocumentStore\n", - "from dotenv import load_dotenv\n", - "\n", - "template = \"\"\"You are a knowledgeable assistant responsible for creating comprehensive documentation based on the following list of documents that refer to the user’s question. The content of these documents may contradict each other, so please prioritize the information from the documents with the most recent dates. If there are contradicting documents with dates labeled as 'None', provide all alternatives and explicitly indicate which parts contradict one another. However, if a document with date labeled as 'None' does not conflict with others, it should be included without special mention.\n", - "\n", - "Instructions:\n", - "2. Analyze the Documents:\n", - " - Review each document, noting any conflicting information.\n", - " - Prioritize information from the most recent documents.\n", - "3. Handling Documents with 'None' Date:\n", - " - If a document has a date marked as 'None':\n", - " - Include all relevant alternatives and clearly indicate contradictions.\n", - " - If it does not conflict with other documents, include it without special mention.\n", - "\n", - "Output Format:\n", - "Your output should be structured using Markdown and include the following sections:\n", - "1. Summary:\n", - " - Provide a brief overview of the key findings from all documents.\n", - "2. Detailed Analysis:\n", - " - Present detailed descriptions of key points, prioritizing the latest information.\n", - " - Preserve and format any code snippets from the documents appropriately.\n", - " - Present full semantic context retrieved from given documents.\n", - "3. 
Contradictions:\n", - " - For documents dated 'None', list all relevant alternatives and explicitly highlight any contradictions.\n", - " - Do not generate this section if there are no contradictions.\n", - "\n", - "General Guidelines:\n", - "- Ensure thoroughness by including all relevant information, aiming for completeness rather than brevity.\n", - "- Use headings, lists, and code blocks to enhance readability and organization.\n", - "- Given .md files should be the base structure of generated file. If .md files are poor, treat them as regular source.\n", - "\n", - "User Question: {{question}}\n", - "Documents to Analyze:\n", - "{% for doc in documents %}\n", - "Date: {{doc.meta['date']}}\n", - "Title: {{doc.meta['title']}} - {{doc.meta['headline']}}\n", - "Content: \n", - "{{doc.content}}\n", - "{% endfor %}\"\"\"\n", - "\n", - "load_dotenv()\n", - "\n", - "docs_wire_ds = PineconeDocumentStore(\n", - " index=\"wire-rag\",\n", - " namespace=\"docs-wire\",\n", - " dimension=384,\n", - " metric=\"cosine\",\n", - " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", - ")\n", - "\n", - "github_wireapp_ds = PineconeDocumentStore(\n", - " index=\"wire-rag\",\n", - " namespace=\"github-wireapp\",\n", - " dimension=384,\n", - " metric=\"cosine\",\n", - " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", - ")\n", - "\n", - "text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", - "retriever_docs_wire = PineconeEmbeddingRetriever(document_store=docs_wire_ds, top_k=15)\n", - "retriever_gh = PineconeEmbeddingRetriever(document_store=github_wireapp_ds, top_k=25)\n", - "joiner = DocumentJoiner(join_mode=\"concatenate\")\n", - "prompt_builder = PromptBuilder(template=template)\n", - "generator = OpenAIGenerator(model=\"gpt-4o-mini\") # \"gpt-4o-mini\" \"gpt-4o\" \"gpt-3.5-turbo\"\n", - "answer_builder = AnswerBuilder()\n", - "\n", - "rag_pipeline = Pipeline()\n", - 
"rag_pipeline.add_component(\"text_embedder\", text_embedder)\n", - "rag_pipeline.add_component(\"retriever_docs_wire\", retriever_docs_wire)\n", - "rag_pipeline.add_component(\"retriever_gh\", retriever_gh)\n", - "rag_pipeline.add_component(\"joiner\", joiner)\n", - "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", - "rag_pipeline.add_component(\"generator\", generator)\n", - "rag_pipeline.add_component(\"answer_builder\", answer_builder)\n", - "\n", - "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever_docs_wire.query_embedding\")\n", - "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever_gh.query_embedding\")\n", - "rag_pipeline.connect(\"retriever_docs_wire\", \"joiner\")\n", - "rag_pipeline.connect(\"retriever_gh\", \"joiner\")\n", - "\n", - "rag_pipeline.connect(\"joiner\", \"prompt_builder.documents\")\n", - "rag_pipeline.connect(\"joiner\", \"answer_builder.documents\")\n", - "rag_pipeline.connect(\"prompt_builder\", \"generator\")\n", - "rag_pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", - "\n", - "with open(\"./pipeline.yml\", \"w\") as file:\n", - " rag_pipeline.dump(file)" - ], - "outputs": [], - "execution_count": 1 - }, - { - "metadata": { - "collapsed": false, - "id": "f7c6c62a9fa033a1" - }, - "cell_type": "markdown", - "source": "Run the pipeline with a query", - "id": "f7c6c62a9fa033a1" - }, - { - "cell_type": "code", - "id": "d066fdea1ddd46d3", - "metadata": { - "id": "d066fdea1ddd46d3", - "ExecuteTime": { - "end_time": "2024-10-01T10:52:31.726464Z", - "start_time": "2024-10-01T10:52:11.924669Z" + }, + { + "cell_type": "code", + "id": "8005a467", + "metadata": { + "id": "8005a467", + "ExecuteTime": { + "end_time": "2024-10-01T08:01:48.072118Z", + "start_time": "2024-10-01T08:00:43.421241Z" + }, + "outputId": "c88db24a-0a1b-4544-d5c3-a1010a56c5f6" + }, + "source": [ + "from utils.url_scraper import extract_content_and_metadata, DATE_FORMATS, DATE_PATTERNS\n", + "\n", + 
"scraped_urls_dict = []\n", + "for u in scraped_urls:\n", + " scraped_urls_dict += extract_content_and_metadata(u, DATE_FORMATS, DATE_PATTERNS)\n", + "print(len(scraped_urls_dict))" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6679\n" + ] + } + ], + "execution_count": null + }, + { + "metadata": { + "id": "65d676deb188d49c" + }, + "cell_type": "markdown", + "source": [ + "### Scrape Github" + ], + "id": "65d676deb188d49c" + }, + { + "metadata": { + "id": "1892ddc2fc1323b5" + }, + "cell_type": "code", + "source": [ + "from utils.github_scraper import scrape_md_files\n", + "\n", + "md_dict = await scrape_md_files(org_name=\"wireapp\", api_key=os.getenv(\"GITHUB_API_TOKEN\"), repo_limit=None)\n", + "print(len(md_dict))" + ], + "id": "1892ddc2fc1323b5", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "id": "48d1db0dbee9deab" + }, + "cell_type": "markdown", + "source": [ + "### Save / Load .json" + ], + "id": "48d1db0dbee9deab" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-01T09:27:42.316090Z", + "start_time": "2024-10-01T09:27:42.153336Z" + }, + "id": "be188a477f9defa1", + "outputId": "29224fd9-9266-42f3-e4a4-e370e2595c93" + }, + "cell_type": "code", + "source": [ + "import json\n", + "\n", + "# with open(\"./github_docs.json\", \"w\", encoding='utf-8') as json_file:\n", + "# json.dump(md_dict, json_file, ensure_ascii=False, indent=4)\n", + "#\n", + "# with open(\"./docs_wire.json\", \"w\", encoding='utf-8') as json_file:\n", + "# json.dump(scraped_urls_dict, json_file, ensure_ascii=False, indent=4)\n", + "\n", + "with open(\"./github_docs.json\", 'r', encoding='utf-8') as json_file:\n", + " md_dict = json.load(json_file)\n", + "print(len(md_dict), md_dict[0]['metadata'], sep='\\n')\n", + "\n", + "with open(\"./docs_wire.json\", 'r', encoding='utf-8') as json_file:\n", + " scraped_urls_dict = json.load(json_file)\n", + "print(len(scraped_urls_dict), scraped_urls_dict[0]['metadata'], 
sep='\\n')" + ], + "id": "be188a477f9defa1", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1366\n", + "{'url': 'https://github.com/wireapp/libsodium.js/blob/master/README.md', 'title': 'libsodium.js/README.md', 'headline': '', 'date': '2015-10-07'}\n", + "6679\n", + "{'url': 'https://docs.wire.com', 'title': 'Welcome to Wire’s documentation! — Wire 0.0.4 documentation', 'headline': 'Welcome to Wire’s documentation!\\uf0c1', 'date': 'Unknown'}\n" + ] + } + ], + "execution_count": null + }, + { + "metadata": { + "id": "8e4fd58bcace53ee" + }, + "cell_type": "markdown", + "source": [ + "### Populate database" + ], + "id": "8e4fd58bcace53ee" + }, + { + "cell_type": "markdown", + "source": [ + "To delete all records u need to `pip install \"pinecone[grpc]\"` and run the following code." + ], + "metadata": { + "collapsed": false, + "id": "17ee68b0c460d0d6" + }, + "id": "17ee68b0c460d0d6" + }, + { + "cell_type": "code", + "source": [ + "# Uncomment to delete all db records\n", + "# import os\n", + "# from pinecone import Pinecone\n", + "# Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\")).Index(\"wire-rag\").delete(delete_all=True, namespace='docs-wire')" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-01T09:27:23.635153Z", + "start_time": "2024-10-01T09:27:23.054147Z" + }, + "id": "eaa67c386a4cf554", + "outputId": "c3586951-de48-4298-933e-7e69d4707d3d" + }, + "id": "eaa67c386a4cf554", + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Initialize Pinecone Document Store" + ], + "metadata": { + "collapsed": false, + "id": "e757de5fa7764edb" + }, + "id": "e757de5fa7764edb" + }, + { + "cell_type": "code", + "id": "5142cfb8161abc1", + "metadata": { + "id": "5142cfb8161abc1", + "ExecuteTime": { + "end_time": "2024-10-01T09:28:11.252209Z", + "start_time": 
"2024-10-01T09:28:09.397726Z" + }, + "outputId": "6fa96f44-0a72-4e98-b720-289b885e9997" + }, + "source": [ + "from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner\n", + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", + "from haystack.components.writers import DocumentWriter\n", + "from haystack import Pipeline\n", + "from haystack import Document\n", + "from haystack_integrations.document_stores.pinecone import PineconeDocumentStore\n", + "\n", + "docs_wire_ds = PineconeDocumentStore(\n", + " index=\"wire-rag\",\n", + " namespace=\"docs-wire\",\n", + " dimension=384,\n", + " metric=\"cosine\",\n", + " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", + ")\n", + "\n", + "github_wireapp_ds = PineconeDocumentStore(\n", + " index=\"wire-rag\",\n", + " namespace=\"github-wireapp\",\n", + " dimension=384,\n", + " metric=\"cosine\",\n", + " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", + ")\n", + "\n", + "scraped_urls_documents = [Document(content=doc[\"content\"], meta=doc[\"metadata\"]) for doc in scraped_urls_dict]\n", + "print(f\"Scraped URLs documents: {len(scraped_urls_documents)}\")\n", + "\n", + "github_documents = [Document(content=doc[\"content\"], meta=doc[\"metadata\"]) for doc in md_dict]\n", + "print(f\"Github documents: {len(github_documents)}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scraped URLs documents: 6679\n", + "Github documents: 1366\n" + ] + } + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "3bfe34af87648a2a", + "metadata": { + "collapsed": false, + "id": "3bfe34af87648a2a" + }, + "source": [ + "Create a pipelines to populate the Pinecone Document Store with both github and docs.wire documetns" + ] + }, + { + "cell_type": "code", + "id": "f0df56147ec0fad8", + "metadata": { + "id": "f0df56147ec0fad8", + "ExecuteTime": { + "end_time": 
"2024-10-01T09:28:16.626519Z", + "start_time": "2024-10-01T09:28:16.620539Z" + }, + "outputId": "fc8e369a-8a1e-4dfc-9e86-3a1bb1412bfc" + }, + "source": [ + "# For all-MiniLM-L6-v2 default input text is 256 word pieces.\n", + "splitter_gh = DocumentSplitter(split_by=\"word\", split_length=256, split_overlap=20)\n", + "embedder_gh = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "writer_gh = DocumentWriter(github_wireapp_ds)\n", + "\n", + "pipeline_github = Pipeline()\n", + "pipeline_github.add_component(instance=splitter_gh, name=\"splitter_gh\")\n", + "pipeline_github.add_component(instance=embedder_gh, name=\"embedder_gh\")\n", + "pipeline_github.add_component(instance=writer_gh, name=\"writer_gh\")\n", + "\n", + "pipeline_github.connect(\"splitter_gh\", \"embedder_gh\")\n", + "pipeline_github.connect(\"embedder_gh\", \"writer_gh\")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - splitter_gh: DocumentSplitter\n", + " - embedder_gh: SentenceTransformersDocumentEmbedder\n", + " - writer_gh: DocumentWriter\n", + "🛤️ Connections\n", + " - splitter_gh.documents -> embedder_gh.documents (List[Document])\n", + " - embedder_gh.documents -> writer_gh.documents (List[Document])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-01T09:28:30.574364Z", + "start_time": "2024-10-01T09:28:30.565207Z" + }, + "id": "21543dec4ca93b9c", + "outputId": "42b59621-0b87-473b-9a2d-5665b347a1b7" + }, + "cell_type": "code", + "source": [ + "cleaner_scraped = DocumentCleaner()\n", + "# For all-MiniLM-L6-v2 default input text is 256 word pieces.\n", + "splitter_scraped = DocumentSplitter(split_by=\"word\", split_length=256, split_overlap=20)\n", + "embedder_scraped = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "writer_scraped 
= DocumentWriter(docs_wire_ds)\n", + "\n", + "pipeline_scraped = Pipeline()\n", + "pipeline_scraped.add_component(instance=cleaner_scraped, name=\"cleaner_scraped\")\n", + "pipeline_scraped.add_component(instance=splitter_scraped, name=\"splitter_scraped\")\n", + "pipeline_scraped.add_component(instance=embedder_scraped, name=\"embedder_scraped\")\n", + "pipeline_scraped.add_component(instance=writer_scraped, name=\"writer_scraped\")\n", + "\n", + "pipeline_scraped.connect(\"cleaner_scraped\", \"splitter_scraped\")\n", + "pipeline_scraped.connect(\"splitter_scraped\", \"embedder_scraped\")\n", + "pipeline_scraped.connect(\"embedder_scraped\", \"writer_scraped\")" + ], + "id": "21543dec4ca93b9c", + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + "🚅 Components\n", + " - cleaner_scraped: DocumentCleaner\n", + " - splitter_scraped: DocumentSplitter\n", + " - embedder_scraped: SentenceTransformersDocumentEmbedder\n", + " - writer_scraped: DocumentWriter\n", + "🛤️ Connections\n", + " - cleaner_scraped.documents -> splitter_scraped.documents (List[Document])\n", + " - splitter_scraped.documents -> embedder_scraped.documents (List[Document])\n", + " - embedder_scraped.documents -> writer_scraped.documents (List[Document])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "c8fddbeb5665f26", + "metadata": { + "collapsed": false, + "id": "c8fddbeb5665f26" + }, + "source": [ + "Run the pipeline" + ] + }, + { + "cell_type": "code", + "id": "7a1c47cb85342d86", + "metadata": { + "id": "7a1c47cb85342d86" + }, + "source": [ + "pipeline_github.run(data = {\"splitter_gh\": { \"documents\" : github_documents }})\n", + "pipeline_scraped.run(data = {\"cleaner_scraped\": { \"documents\" : scraped_urls_documents }})\n", + "# preprocessing_pipeline.show()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "9eae0c4f1384253e", + 
"metadata": { + "collapsed": false, + "id": "9eae0c4f1384253e" + }, + "source": [ + "## Test RAG with Pinecone Document Store" + ] + }, + { + "cell_type": "markdown", + "id": "75a635b2d20f0142", + "metadata": { + "collapsed": false, + "id": "75a635b2d20f0142" + }, + "source": [ + "Restart the kernel and run the following code to test the RAG pipeline with the populated Pinecone Document Store.\\\n", + "Create a pipeline to run a query" + ] + }, + { + "cell_type": "code", + "id": "1b38c8ef5aacfe0", + "metadata": { + "id": "1b38c8ef5aacfe0", + "ExecuteTime": { + "end_time": "2024-10-01T10:52:05.586737Z", + "start_time": "2024-10-01T10:52:03.820128Z" + } + }, + "source": [ + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.builders.answer_builder import AnswerBuilder\n", + "from haystack.components.builders.prompt_builder import PromptBuilder\n", + "from haystack.components.joiners.document_joiner import DocumentJoiner\n", + "from haystack import Pipeline\n", + "from haystack_integrations.document_stores.pinecone import PineconeDocumentStore\n", + "from dotenv import load_dotenv\n", + "\n", + "template = \"\"\"You are a knowledgeable assistant responsible for creating comprehensive documentation based on the following list of documents that refer to the user’s question. The content of these documents may contradict each other, so please prioritize the information from the documents with the most recent dates. If there are contradicting documents with dates labeled as 'None', provide all alternatives and explicitly indicate which parts contradict one another. However, if a document with date labeled as 'None' does not conflict with others, it should be included without special mention.\n", + "\n", + "Instructions:\n", + "1. 
Analyze the Documents:\n", + " - Review each document, noting any conflicting information.\n", + " - Prioritize information from the most recent documents.\n", + "2. Handling Documents with 'None' Date:\n", + " - If a document has a date marked as 'None':\n", + " - Include all relevant alternatives and clearly indicate contradictions.\n", + " - If it does not conflict with other documents, include it without special mention.\n", + "\n", + "Output Format:\n", + "Your output should be structured using Markdown and include the following sections:\n", + "1. Summary:\n", + " - Provide a brief overview of the key findings from all documents.\n", + "2. Detailed Analysis:\n", + " - Present detailed descriptions of key points, prioritizing the latest information.\n", + " - Preserve and format any code snippets from the documents appropriately.\n", + " - Present full semantic context retrieved from given documents.\n", + " - Be descriptive and exhaustive, don't skip any relevant information found.\n", + " - Use only the data given to you; do not rely on your own knowledge.\n", + "3. Contradictions:\n", + " - For documents dated 'None', list all relevant alternatives and explicitly highlight any contradictions.\n", + " - Do not generate this section if there are no contradictions.\n", + "\n", + "General Guidelines:\n", + "- Ensure thoroughness by including all relevant information, aiming for completeness rather than brevity.\n", + "- Use headings, lists, and code blocks to enhance readability and organization.\n", + "- Given .md files should be the base structure of generated file. 
If .md files are poor, treat them as regular source.\n", + "\n", + "User Question: {{question}}\n", + "Documents to Analyze:\n", + "{% for doc in documents %}\n", + "Date: {{doc.meta['date']}}\n", + "Title: {{doc.meta['title']}} - {{doc.meta['headline']}}\n", + "Content:\n", + "{{doc.content}}\n", + "{% endfor %}\"\"\"\n", + "\n", + "load_dotenv()\n", + "\n", + "docs_wire_ds = PineconeDocumentStore(\n", + " index=\"wire-rag\",\n", + " namespace=\"docs-wire\",\n", + " dimension=384,\n", + " metric=\"cosine\",\n", + " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", + ")\n", + "\n", + "github_wireapp_ds = PineconeDocumentStore(\n", + " index=\"wire-rag\",\n", + " namespace=\"github-wireapp\",\n", + " dimension=384,\n", + " metric=\"cosine\",\n", + " spec={\"serverless\": {\"region\": \"us-east-1\", \"cloud\": \"aws\"}}\n", + ")\n", + "\n", + "text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "retriever_docs_wire = PineconeEmbeddingRetriever(document_store=docs_wire_ds, top_k=15)\n", + "retriever_gh = PineconeEmbeddingRetriever(document_store=github_wireapp_ds, top_k=25)\n", + "joiner = DocumentJoiner(join_mode=\"concatenate\")\n", + "prompt_builder = PromptBuilder(template=template)\n", + "generator = OpenAIGenerator(model=\"gpt-4o-mini\") # \"gpt-4o-mini\" \"gpt-4o\" \"gpt-3.5-turbo\"\n", + "answer_builder = AnswerBuilder()\n", + "\n", + "rag_pipeline = Pipeline()\n", + "rag_pipeline.add_component(\"text_embedder\", text_embedder)\n", + "rag_pipeline.add_component(\"retriever_docs_wire\", retriever_docs_wire)\n", + "rag_pipeline.add_component(\"retriever_gh\", retriever_gh)\n", + "rag_pipeline.add_component(\"joiner\", joiner)\n", + "rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", + "rag_pipeline.add_component(\"generator\", generator)\n", + "rag_pipeline.add_component(\"answer_builder\", answer_builder)\n", + "\n", + "rag_pipeline.connect(\"text_embedder.embedding\", 
\"retriever_docs_wire.query_embedding\")\n", + "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever_gh.query_embedding\")\n", + "rag_pipeline.connect(\"retriever_docs_wire\", \"joiner\")\n", + "rag_pipeline.connect(\"retriever_gh\", \"joiner\")\n", + "\n", + "rag_pipeline.connect(\"joiner\", \"prompt_builder.documents\")\n", + "rag_pipeline.connect(\"joiner\", \"answer_builder.documents\")\n", + "rag_pipeline.connect(\"prompt_builder\", \"generator\")\n", + "rag_pipeline.connect(\"generator.replies\", \"answer_builder.replies\")\n", + "\n", + "with open(\"./pipeline.yml\", \"w\") as file:\n", + " rag_pipeline.dump(file)" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "collapsed": false, + "id": "f7c6c62a9fa033a1" + }, + "cell_type": "markdown", + "source": [ + "Run the pipeline with a query" + ], + "id": "f7c6c62a9fa033a1" + }, + { + "cell_type": "code", + "id": "d066fdea1ddd46d3", + "metadata": { + "id": "d066fdea1ddd46d3", + "ExecuteTime": { + "end_time": "2024-10-01T10:52:31.726464Z", + "start_time": "2024-10-01T10:52:11.924669Z" + }, + "outputId": "b46bd6bd-073b-4d0b-f963-b5828662b5ff", + "colab": { + "referenced_widgets": [ + "b7c1880341b84a80ab6b184ff6d821c8" + ] + } + }, + "source": [ + "query = \"Generate complete documentation of Legal Hold\"\n", + "result = rag_pipeline.run({\n", + " \"text_embedder\": {\"text\": query},\n", + " \"prompt_builder\": {\"question\": query},\n", + " \"answer_builder\": {\"query\": query}\n", + "})\n", + "\n", + "print(result['answer_builder']['answers'][0].query)\n", + "print(result['answer_builder']['answers'][0].data, end='\\n\\n\\n')\n", + "for i, doc in enumerate(result['answer_builder']['answers'][0].documents):\n", + " print(f\"{i + 1}. 
{doc.meta['title']} {doc.meta['headline']} - {doc.to_dict()['url']}\")\n", + "\n", + "with open(\"./output.md\", \"w\") as f:\n", + " f.write(result['answer_builder']['answers'][0].data)" + ], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/filip/Documents/.venvs/haystack/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "Batches: 0%| | 0/1 [00:00';\n", + " ```\n", + "\n", + "3. **Create Legal Hold Database**:\n", + " ```sql\n", + " CREATE DATABASE legalhold;\n", + " ```\n", + "\n", + "4. **Configure and Run Legal Hold Service**:\n", + " Ensure Docker is installed and use a random secret token for configuration.\n", + " ```bash\n", + " docker run -e DB_URL= \n", + " ```\n", + "\n", + "5. **Configure DNS**: \n", + " Point a subdomain (e.g., `legal.`) to the Legal Hold service.\n", + "\n", + "### User Consent and Status Management\n", + "Before a user can have a legal hold device added to their account, consent is mandatory:\n", + "- Users receive a prompt for their consent before being subjected to legal holds. 
Only upon granting consent can the legal hold devices be assigned.\n", + "\n", + "#### API Endpoints for Legal Hold Operations\n", + "- **Request a user to be put under legal hold**:\n", + " ```http\n", + " POST /teams/{tid}/legalhold/{uid}\n", + " ```\n", + " Responds with `201 Created` if successful.\n", + "\n", + "- **User Approval Process**:\n", + " ```http\n", + " PUT /teams/{tid}/legalhold/{uid}/approve\n", + " {\n", + " \"password\": \"\" // optional for password-less users\n", + " }\n", + " ```\n", + " Responds with `200 OK`.\n", + "\n", + "- **Deletion of Legal Hold by Admin**:\n", + " ```http\n", + " DELETE /teams/{tid}/legalhold/{uid}\n", + " {\n", + " \"password\": \"\" // optional for password-less admins\n", + " }\n", + " ```\n", + " Responds with `200 OK`.\n", + "\n", + "- **Get Legal Hold Status**:\n", + " ```http\n", + " GET /team/{tid}/members\n", + " ```\n", + " The response includes a `legalhold_status` field indicating if legal hold is enabled or disabled for team members.\n", + "\n", + "### Events Related to Legal Holds\n", + "Various events are triggered in response to actions regarding legal holds:\n", + "- New legal hold request:\n", + " ```json\n", + " { \"type\": \"user.legalhold-request\", \"id\": UserID, ... }\n", + " ```\n", + "- Legal hold enabled:\n", + " ```json\n", + " { \"type\": \"user.legalhold-enable\", \"id\": UserID }\n", + " ```\n", + "- Legal hold disabled:\n", + " ```json\n", + " { \"type\": \"user.legalhold-disable\", \"id\": UserID }\n", + " ```\n", + "\n", + "## Contradictions\n", + "While analyzing the documents, certain elements mentioned in different documents may appear repetitive rather than directly contradictory. The latest documentation dated 2023-01-25 aligns consistently in the following aspects:\n", + "\n", + "### Key Contradictions:\n", + "1. **Device Management**:\n", + " - The notion of adding legal hold devices is consistent: team admins can add devices only with user consent. 
However, in different documents, the way in which this user consent is sought or presented may vary slightly.\n", + "\n", + "2. **User Notification**:\n", + " - Some documents state explicit methods (e.g., red dots on the UI) to indicate active legal holds, whereas others imply general notifications without specifying UI elements.\n", + "\n", + "3. **Database and Service Installation**:\n", + " - Various undocumented steps such as the necessity of Docker installations or specific command structures were echoed throughout; however, the latest version is the go-to source for accurate instructions.\n", + "\n", + "By ensuring that programmers and users follow the details in the 2023-01-25 documentation, potential confusion created by outdated or varying descriptions in documents without dates should be minimized. \n", + "\n", + "This document consolidates comprehensive legal hold information and provides a relevant view for its implementation, while also highlighting any inconsistencies that could affect usability and comprehension.\n", + "\n", + "\n", + "1. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", + "2. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", + "3. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#introduction\n", + "4. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html\n", + "5. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", + "6. 
wire-server/docs/src/understand/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/understand/legalhold.md\n", + "7. Legal hold — Wire 0.0.4 documentation Legal hold - https://docs.wire.com/developer/reference/team/legalhold.html\n", + "8. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", + "9. Legal hold — Wire 0.0.4 documentation API and flows - https://docs.wire.com/developer/reference/team/legalhold.html\n", + "10. Legal hold — Wire 0.0.4 documentation Events - https://docs.wire.com/developer/reference/team/legalhold.html\n", + "11. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/balanced-match/LICENSE.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/balanced-match/LICENSE.md\n", + "12. github-action-wire-messenger/node_modules/logdown/license.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/logdown/license.md\n", + "13. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/brace-expansion/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/brace-expansion/README.md\n", + "14. dependency-track/CONTRIBUTING.md - https://github.com/wireapp/dependency-track/blob/master/CONTRIBUTING.md\n", + "15. uniffi-rs/weedle2/LICENSE.md - https://github.com/wireapp/uniffi-rs/blob/main/weedle2/LICENSE.md\n", + "16. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html\n", + "17. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", + "18. 
Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", + "19. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#introduction\n", + "20. github-action-wire-messenger/node_modules/psl/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/psl/README.md\n", + "21. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/balanced-match/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/balanced-match/README.md\n", + "22. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", + "23. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html\n", + "24. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#introduction\n", + "25. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", + "26. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", + "27. wire-server/docs/README.md - https://github.com/wireapp/wire-server/blob/develop/docs/README.md\n", + "28. openmls/book/src/release_management.md - https://github.com/wireapp/openmls/blob/wire/stable/book/src/release_management.md\n", + "29. 
github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/fast-levenshtein/LICENSE.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/fast-levenshtein/LICENSE.md\n", + "30. servant/servant-server/example/README.md - https://github.com/wireapp/servant/blob/master/servant-server/example/README.md\n", + "31. openmls/CONTRIBUTING.md - https://github.com/wireapp/openmls/blob/wire/stable/CONTRIBUTING.md\n", + "32. github-action-wire-messenger/node_modules/http-status-codes/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/http-status-codes/README.md\n", + "33. wire-android/docs/adr/0001-record-architecture-decisions.md - https://github.com/wireapp/wire-android/blob/develop/docs/adr/0001-record-architecture-decisions.md\n", + "34. openmls/book/src/release_management.md - https://github.com/wireapp/openmls/blob/wire/stable/book/src/release_management.md\n", + "35. rust-pki/certval/CHANGELOG.md - https://github.com/wireapp/rust-pki/blob/wire/stable/certval/CHANGELOG.md\n", + "36. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/escodegen/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/escodegen/README.md\n", + "37. uniffi-rs/docs/adr/template.md - https://github.com/wireapp/uniffi-rs/blob/main/docs/adr/template.md\n", + "38. ansible-minio/LICENSE.md - https://github.com/wireapp/ansible-minio/blob/master/LICENSE.md\n", + "39. wire-android/docs/adr/0000-template-lightway-adr.md - https://github.com/wireapp/wire-android/blob/develop/docs/adr/0000-template-lightway-adr.md\n", + "40. 
re/README.md - https://github.com/wireapp/re/blob/master/README.md\n" + ] + } + ], + "execution_count": null } - }, - "source": [ - "query = \"Generate complete documentation of Legal Hold\"\n", - "result = rag_pipeline.run({\n", - " \"text_embedder\": {\"text\": query},\n", - " \"prompt_builder\": {\"question\": query},\n", - " \"answer_builder\": {\"query\": query}\n", - "})\n", - "\n", - "print(result['answer_builder']['answers'][0].query)\n", - "print(result['answer_builder']['answers'][0].data, end='\\n\\n\\n')\n", - "for i, doc in enumerate(result['answer_builder']['answers'][0].documents):\n", - " print(f\"{i + 1}. {doc.meta['title']} {doc.meta['headline']} - {doc.to_dict()['url']}\")\n", - "\n", - "with open(\"./output.md\", \"w\") as f:\n", - " f.write(result['answer_builder']['answers'][0].data)" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/filip/Documents/.venvs/haystack/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "Batches: 0%| | 0/1 [00:00';\n", - " ```\n", - "\n", - "3. **Create Legal Hold Database**:\n", - " ```sql\n", - " CREATE DATABASE legalhold;\n", - " ```\n", - "\n", - "4. **Configure and Run Legal Hold Service**:\n", - " Ensure Docker is installed and use a random secret token for configuration.\n", - " ```bash\n", - " docker run -e DB_URL= \n", - " ```\n", - "\n", - "5. 
**Configure DNS**: \n", - " Point a subdomain (e.g., `legal.`) to the Legal Hold service.\n", - "\n", - "### User Consent and Status Management\n", - "Before a user can have a legal hold device added to their account, consent is mandatory:\n", - "- Users receive a prompt for their consent before being subjected to legal holds. Only upon granting consent can the legal hold devices be assigned.\n", - "\n", - "#### API Endpoints for Legal Hold Operations\n", - "- **Request a user to be put under legal hold**:\n", - " ```http\n", - " POST /teams/{tid}/legalhold/{uid}\n", - " ```\n", - " Responds with `201 Created` if successful.\n", - "\n", - "- **User Approval Process**:\n", - " ```http\n", - " PUT /teams/{tid}/legalhold/{uid}/approve\n", - " {\n", - " \"password\": \"\" // optional for password-less users\n", - " }\n", - " ```\n", - " Responds with `200 OK`.\n", - "\n", - "- **Deletion of Legal Hold by Admin**:\n", - " ```http\n", - " DELETE /teams/{tid}/legalhold/{uid}\n", - " {\n", - " \"password\": \"\" // optional for password-less admins\n", - " }\n", - " ```\n", - " Responds with `200 OK`.\n", - "\n", - "- **Get Legal Hold Status**:\n", - " ```http\n", - " GET /team/{tid}/members\n", - " ```\n", - " The response includes a `legalhold_status` field indicating if legal hold is enabled or disabled for team members.\n", - "\n", - "### Events Related to Legal Holds\n", - "Various events are triggered in response to actions regarding legal holds:\n", - "- New legal hold request:\n", - " ```json\n", - " { \"type\": \"user.legalhold-request\", \"id\": UserID, ... 
}\n", - " ```\n", - "- Legal hold enabled:\n", - " ```json\n", - " { \"type\": \"user.legalhold-enable\", \"id\": UserID }\n", - " ```\n", - "- Legal hold disabled:\n", - " ```json\n", - " { \"type\": \"user.legalhold-disable\", \"id\": UserID }\n", - " ```\n", - "\n", - "## Contradictions\n", - "While analyzing the documents, certain elements mentioned in different documents may appear repetitive rather than directly contradictory. The latest documentation dated 2023-01-25 aligns consistently in the following aspects:\n", - "\n", - "### Key Contradictions:\n", - "1. **Device Management**:\n", - " - The notion of adding legal hold devices is consistent: team admins can add devices only with user consent. However, in different documents, the way in which this user consent is sought or presented may vary slightly.\n", - "\n", - "2. **User Notification**:\n", - " - Some documents state explicit methods (e.g., red dots on the UI) to indicate active legal holds, whereas others imply general notifications without specifying UI elements.\n", - "\n", - "3. **Database and Service Installation**:\n", - " - Various undocumented steps such as the necessity of Docker installations or specific command structures were echoed throughout; however, the latest version is the go-to source for accurate instructions.\n", - "\n", - "By ensuring that programmers and users follow the details in the 2023-01-25 documentation, potential confusion created by outdated or varying descriptions in documents without dates should be minimized. \n", - "\n", - "This document consolidates comprehensive legal hold information and provides a relevant view for its implementation, while also highlighting any inconsistencies that could affect usability and comprehension.\n", - "\n", - "\n", - "1. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", - "2. 
Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", - "3. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#introduction\n", - "4. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html\n", - "5. Installing and setting up Legal Hold — Wire 0.0.4 documentation Introduction - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", - "6. wire-server/docs/src/understand/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/understand/legalhold.md\n", - "7. Legal hold — Wire 0.0.4 documentation Legal hold - https://docs.wire.com/developer/reference/team/legalhold.html\n", - "8. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", - "9. Legal hold — Wire 0.0.4 documentation API and flows - https://docs.wire.com/developer/reference/team/legalhold.html\n", - "10. Legal hold — Wire 0.0.4 documentation Events - https://docs.wire.com/developer/reference/team/legalhold.html\n", - "11. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/balanced-match/LICENSE.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/balanced-match/LICENSE.md\n", - "12. github-action-wire-messenger/node_modules/logdown/license.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/logdown/license.md\n", - "13. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/brace-expansion/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/brace-expansion/README.md\n", - "14. 
dependency-track/CONTRIBUTING.md - https://github.com/wireapp/dependency-track/blob/master/CONTRIBUTING.md\n", - "15. uniffi-rs/weedle2/LICENSE.md - https://github.com/wireapp/uniffi-rs/blob/main/weedle2/LICENSE.md\n", - "16. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html\n", - "17. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", - "18. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", - "19. Installing and setting up Legal Hold — Wire 0.0.4 documentation Installing Legal Hold - https://docs.wire.com/understand/legalhold.html#introduction\n", - "20. github-action-wire-messenger/node_modules/psl/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/psl/README.md\n", - "21. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/balanced-match/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/balanced-match/README.md\n", - "22. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#configuring-team-settings-to-use-legal-hold\n", - "23. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html\n", - "24. Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#introduction\n", - "25. 
Installing and setting up Legal Hold — Wire 0.0.4 documentation Configuring Team Settings to use Legal Hold - https://docs.wire.com/understand/legalhold.html#installing-legal-hold\n", - "26. wire-server/docs/src/developer/reference/team/legalhold.md - https://github.com/wireapp/wire-server/blob/develop/docs/src/developer/reference/team/legalhold.md\n", - "27. wire-server/docs/README.md - https://github.com/wireapp/wire-server/blob/develop/docs/README.md\n", - "28. openmls/book/src/release_management.md - https://github.com/wireapp/openmls/blob/wire/stable/book/src/release_management.md\n", - "29. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/fast-levenshtein/LICENSE.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/fast-levenshtein/LICENSE.md\n", - "30. servant/servant-server/example/README.md - https://github.com/wireapp/servant/blob/master/servant-server/example/README.md\n", - "31. openmls/CONTRIBUTING.md - https://github.com/wireapp/openmls/blob/wire/stable/CONTRIBUTING.md\n", - "32. github-action-wire-messenger/node_modules/http-status-codes/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/http-status-codes/README.md\n", - "33. wire-android/docs/adr/0001-record-architecture-decisions.md - https://github.com/wireapp/wire-android/blob/develop/docs/adr/0001-record-architecture-decisions.md\n", - "34. openmls/book/src/release_management.md - https://github.com/wireapp/openmls/blob/wire/stable/book/src/release_management.md\n", - "35. rust-pki/certval/CHANGELOG.md - https://github.com/wireapp/rust-pki/blob/wire/stable/certval/CHANGELOG.md\n", - "36. github-action-wire-messenger/node_modules/protobufjs/cli/node_modules/escodegen/README.md - https://github.com/wireapp/github-action-wire-messenger/blob/main/node_modules/protobufjs/cli/node_modules/escodegen/README.md\n", - "37. 
uniffi-rs/docs/adr/template.md - https://github.com/wireapp/uniffi-rs/blob/main/docs/adr/template.md\n", - "38. ansible-minio/LICENSE.md - https://github.com/wireapp/ansible-minio/blob/master/LICENSE.md\n", - "39. wire-android/docs/adr/0000-template-lightway-adr.md - https://github.com/wireapp/wire-android/blob/develop/docs/adr/0000-template-lightway-adr.md\n", - "40. re/README.md - https://github.com/wireapp/re/blob/master/README.md\n" - ] + ], + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" } - ], - "execution_count": 2 - } - ], - "metadata": { - "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file