Skip to content

Commit

Permalink
Merge pull request #14 from langchain-ai/eugene/update_ingestion
Browse files Browse the repository at this point in the history
Add file parsing based on mimetype
  • Loading branch information
eyurtsev authored Nov 10, 2023
2 parents b92711f + d25e543 commit f4dc73b
Show file tree
Hide file tree
Showing 18 changed files with 432 additions and 26 deletions.
2 changes: 1 addition & 1 deletion backend/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from fastapi import FastAPI, Form, Request, UploadFile
from fastapi.staticfiles import StaticFiles
from gizmo_agent import agent, ingest_runnable
from langserve import add_routes
from langchain.schema.runnable import RunnableConfig
from langserve import add_routes

from app.storage import (
get_thread_messages,
Expand Down
51 changes: 51 additions & 0 deletions backend/packages/agent-executor/agent_executor/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Code to ingest blob into a vectorstore.
Code is responsible for taking binary data, parsing it and then indexing it
into a vector store.
This code should be agnostic to how the blob got generated; i.e., it does not
know about server/uploading etc.
"""
from typing import List

from langchain.document_loaders import Blob
from langchain.document_loaders.base import BaseBlobParser
from langchain.schema import Document
from langchain.schema.vectorstore import VectorStore
from langchain.text_splitter import TextSplitter


def _update_document_metadata(document: Document, namespace: str) -> None:
    """Tag ``document`` in place with its owning namespace.

    The namespace is recorded under the ``"namespace"`` metadata key so
    that indexed chunks can later be filtered by owner.
    """
    document.metadata.update({"namespace": namespace})


# PUBLIC API


def ingest_blob(
    blob: Blob,
    parser: BaseBlobParser,
    text_splitter: TextSplitter,
    vectorstore: VectorStore,
    namespace: str,
    *,
    batch_size: int = 100,
) -> List[str]:
    """Parse ``blob``, split it into chunks, and index the chunks.

    Args:
        blob: binary payload (plus mimetype metadata) to ingest.
        parser: blob parser that turns the binary data into documents.
        text_splitter: splitter applied to each parsed document.
        vectorstore: destination vector store.
        namespace: namespace written to every chunk's metadata so the
            chunks can later be filtered by owner.
        batch_size: maximum number of chunks sent to the vectorstore in
            a single ``add_documents`` call.

    Returns:
        The vectorstore ids of all indexed chunks, in insertion order.
    """
    docs_to_index: List[Document] = []
    ids: List[str] = []
    for document in parser.lazy_parse(blob):
        docs = text_splitter.split_documents([document])
        for doc in docs:
            # Tag each chunk with its owner before indexing.
            doc.metadata["namespace"] = namespace
        docs_to_index.extend(docs)

        # Flush in slices of exactly batch_size. Previously the threshold
        # was only checked once per parsed document, so a document that
        # split into many chunks could push a single add_documents call
        # far above batch_size.
        while len(docs_to_index) >= batch_size:
            ids.extend(vectorstore.add_documents(docs_to_index[:batch_size]))
            docs_to_index = docs_to_index[batch_size:]

    if docs_to_index:
        ids.extend(vectorstore.add_documents(docs_to_index))

    return ids
24 changes: 24 additions & 0 deletions backend/packages/agent-executor/agent_executor/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Module contains logic for parsing binary blobs into text."""
from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.msword import MsWordParser
from langchain.document_loaders.parsers.txt import TextParser

# Map of supported mimetype -> parser instance that handles it.
HANDLERS = {
    "application/pdf": PDFMinerParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
    "application/msword": MsWordParser(),
    # .docx shares the same Word parser as legacy .doc.
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
        MsWordParser()
    ),
}

# Sorted so the supported-mimetype listing is deterministic (stable for
# error messages and tests).
SUPPORTED_MIMETYPES = sorted(HANDLERS.keys())

# PUBLIC API

# Composite parser that dispatches on a blob's mimetype via HANDLERS.
# NOTE(review): fallback_parser=None means unsupported mimetypes are not
# silently handled — presumably MimeTypeBasedParser raises for them;
# confirm against its documentation.
MIMETYPE_BASED_PARSER = MimeTypeBasedParser(
    handlers=HANDLERS,
    fallback_parser=None,
)
60 changes: 54 additions & 6 deletions backend/packages/agent-executor/agent_executor/upload.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,49 @@
"""API to deal with file uploads via a runnable.
For now this code assumes that the content is a base64 encoded string.
The details here might change in the future.
For the time being, upload and ingestion are coupled
"""
from __future__ import annotations

from typing import Any, BinaryIO, List, Optional

from langchain.document_loaders.blob_loaders.schema import Blob
from langchain.schema.runnable import RunnableConfig, RunnableSerializable
from langchain.schema.vectorstore import VectorStore
from langchain.text_splitter import TextSplitter

from agent_executor.ingest import ingest_blob
from agent_executor.parsing import MIMETYPE_BASED_PARSER


def _guess_mimetype(file_bytes: bytes) -> str:
    """Sniff the mime-type of a file from its raw bytes.

    Relies on the optional ``python-magic`` dependency (a libmagic
    wrapper); raises ImportError with install instructions if absent.
    """
    try:
        import magic
    except ImportError:
        raise ImportError(
            "magic package not found, please install it with `pip install python-magic`"
        )

    detector = magic.Magic(mime=True)
    return detector.from_buffer(file_bytes)


def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
    """Wrap an uploaded binary stream in a Blob with a sniffed mimetype."""
    payload = data.read()
    return Blob.from_data(
        data=payload,
        path=data.name,
        mime_type=_guess_mimetype(payload),
    )


class IngestRunnable(RunnableSerializable[BinaryIO, List[str]]):
text_splitter: TextSplitter
Expand Down Expand Up @@ -33,9 +73,17 @@ def batch(
return_exceptions: bool = False,
**kwargs: Any | None,
) -> List:
docs = self.text_splitter.create_documents(
# TODO change this line to accept binary formats
[part.read().decode() for part in inputs],
[{"namespace": self.namespace}],
)
return self.vectorstore.add_documents(docs)
"""Ingest a batch of files into the vectorstore."""
ids = []
for data in inputs:
blob = _convert_ingestion_input_to_blob(data)
ids.extend(
ingest_blob(
blob,
MIMETYPE_BASED_PARSER,
self.text_splitter,
self.vectorstore,
self.namespace,
)
)
return ids
1 change: 0 additions & 1 deletion backend/packages/gizmo-agent/gizmo_agent/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
ingest_runnable = IngestRunnable(
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200),
vectorstore=vstore,
input_key="file_contents",
).configurable_fields(
assistant_id=ConfigurableField(
id="assistant_id",
Expand Down
4 changes: 4 additions & 0 deletions backend/packages/gizmo-agent/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ openai = ">=0.5.0,<1.0"
anthropic = "^0.3.11"
langchain-experimental = "^0.0.37"
duckduckgo-search = "^3.9.4"
python-magic = "^0.4.27"
bs4 = "^0.0.1"
unstructured = {extras = ["doc", "docx"], version = "^0.10.29"}
pdfminer-six = "^20221105"

[tool.poetry.group.dev.dependencies]
langchain-cli = ">=0.0.15"
Expand Down
18 changes: 0 additions & 18 deletions backend/tests/unit_tests/agent_executor/test_ingestion.py

This file was deleted.

41 changes: 41 additions & 0 deletions backend/tests/unit_tests/agent_executor/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Test parsing logic."""
import mimetypes

from agent_executor.parsing import MIMETYPE_BASED_PARSER, SUPPORTED_MIMETYPES
from langchain.document_loaders import Blob

from tests.unit_tests.fixtures import get_sample_paths


def test_list_of_supported_mimetypes() -> None:
    """This list should generally grow! Protecting against typos in mimetypes."""
    expected = [
        "application/msword",
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/html",
        "text/plain",
    ]
    assert SUPPORTED_MIMETYPES == expected


def test_attempt_to_parse_each_fixture() -> None:
    """Attempt to parse supported fixtures."""
    parsed_mimetypes = set()
    for fixture_path in get_sample_paths():
        mimetype, _ = mimetypes.guess_type(fixture_path)
        if mimetype not in SUPPORTED_MIMETYPES:
            continue
        parsed_mimetypes.add(mimetype)
        documents = MIMETYPE_BASED_PARSER.parse(Blob.from_path(fixture_path))
        try:
            assert len(documents) == 1
            document = documents[0]
            assert "source" in document.metadata
            assert document.metadata["source"] == str(fixture_path)
            assert "🦜" in document.page_content
        except Exception as e:
            raise AssertionError(f"Failed to parse {fixture_path}") from e

    # No .doc fixture is checked in, so msword is expected to be unseen.
    known_missing = {"application/msword"}
    assert set(SUPPORTED_MIMETYPES) - known_missing == parsed_mimetypes
43 changes: 43 additions & 0 deletions backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from io import BytesIO

from agent_executor.upload import IngestRunnable, _guess_mimetype
from langchain.text_splitter import RecursiveCharacterTextSplitter

from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore


def test_ingestion_runnable() -> None:
    """Test ingestion runnable"""
    runnable = IngestRunnable(
        text_splitter=RecursiveCharacterTextSplitter(),
        vectorstore=InMemoryVectorStore(),
        input_key="file_contents",
        assistant_id="TheParrot",
    )
    payload = BytesIO(b"test")
    payload.name = "filename"
    ids = runnable.invoke(payload)
    assert len(ids) == 1


def test_mimetype_guessing() -> None:
    """Verify mimetype guessing for all fixtures."""
    expected = {
        "sample.docx": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ),
        "sample.epub": "application/epub+zip",
        "sample.html": "text/html",
        "sample.odt": "application/vnd.oasis.opendocument.text",
        "sample.pdf": "application/pdf",
        "sample.rtf": "text/rtf",
        "sample.txt": "text/plain",
    }
    observed = {
        path.name: _guess_mimetype(path.read_bytes())
        for path in sorted(get_sample_paths())
    }
    assert expected == observed
11 changes: 11 additions & 0 deletions backend/tests/unit_tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pathlib import Path
from typing import List

HERE = Path(__file__).parent

# PUBLIC API


def get_sample_paths(pattern: str = "sample.*") -> List[Path]:
    """Return the fixture files that live next to this module.

    Args:
        pattern: glob pattern to match within the fixtures directory;
            defaults to the ``sample.*`` naming convention used by the
            test fixtures.

    Returns:
        A list of matching paths (order is filesystem-dependent; callers
        that need determinism should sort).
    """
    return list(HERE.glob(pattern))
Binary file added backend/tests/unit_tests/fixtures/sample.docx
Binary file not shown.
Binary file added backend/tests/unit_tests/fixtures/sample.epub
Binary file not shown.
1 change: 1 addition & 0 deletions backend/tests/unit_tests/fixtures/sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">.lst-kix_n6n0tzfwn8i8-5>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-6>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-8{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-7{list-style-type:none}.lst-kix_n6n0tzfwn8i8-3>li:before{content:"\0025cf "}.lst-kix_n6n0tzfwn8i8-4>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-7>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-8>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-1>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-2>li:before{content:"\0025a0 "}li.li-bullet-0:before{margin-left:-18pt;white-space:nowrap;display:inline-block;min-width:18pt}.lst-kix_n6n0tzfwn8i8-0>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-2{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-1{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-0{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-6{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-5{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-4{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-3{list-style-type:none}ol{margin:0;padding:0}table td,table th{padding:0}.c6{border-right-style:solid;padding:5pt 5pt 5pt 
5pt;border-bottom-color:#000000;border-top-width:1pt;border-right-width:1pt;border-left-color:#000000;vertical-align:top;border-right-color:#000000;border-left-width:1pt;border-top-style:solid;border-left-style:solid;border-bottom-width:1pt;width:156pt;border-top-color:#000000;border-bottom-style:solid}.c0{-webkit-text-decoration-skip:none;color:#000000;font-weight:400;text-decoration:underline;vertical-align:baseline;text-decoration-skip-ink:none;font-size:11pt;font-family:"Arial";font-style:normal}.c4{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left;height:11pt}.c11{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:italic}.c3{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c12{color:#000000;font-weight:700;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c7{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left}.c1{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c8{text-decoration-skip-ink:none;-webkit-text-decoration-skip:none;color:#1155cc;text-decoration:underline}.c14{border-spacing:0;border-collapse:collapse;margin-right:auto}.c13{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 
72pt}.c15{padding:0;margin:0}.c10{margin-left:36pt;padding-left:0pt}.c5{color:inherit;text-decoration:inherit}.c9{height:11pt}.c2{height:0pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c13 doc-content"><p class="c1"><span class="c3">🦜️ LangChain</span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c0">Underline</span></p><p class="c1 c9"><span class="c0"></span></p><p class="c1"><span class="c12">Bold</span></p><p class="c1 c9"><span class="c12"></span></p><p class="c1"><span 
class="c11">Italics</span></p><p class="c1 c9"><span class="c11"></span></p><p class="c1 c9"><span class="c11"></span></p><a id="t.e89270b97fc18eabe5c666cba79cd82cff5b5c3d"></a><a id="t.0"></a><table class="c14"><tbody><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c4"><span class="c12"></span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 2</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">3</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">4</span></p></td></tr></tbody></table><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span>Link: </span><span class="c8"><a class="c5" href="https://www.google.com/url?q=https://www.langchain.com/&amp;sa=D&amp;source=editors&amp;ust=1699572948600868&amp;usg=AOvVaw2T4jvAmPuMvcyed6PrEjq1">https://www.langchain.com/</a></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><ul class="c15 lst-kix_n6n0tzfwn8i8-0 start"><li class="c1 c10 li-bullet-0"><span class="c3">Item 1</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 2</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 3</span></li><li class="c1 c10 li-bullet-0"><span class="c3">We also love cats 🐱</span></li></ul><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c3">Image</span></p><p class="c1 c9"><span 
class="c3"></span></p><p class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 624.00px; height: 132.00px;"><img alt="" src="sample_files/image1.png" style="width: 624.00px; height: 132.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p></body></html>
Binary file added backend/tests/unit_tests/fixtures/sample.odt
Binary file not shown.
Binary file added backend/tests/unit_tests/fixtures/sample.pdf
Binary file not shown.
Loading

0 comments on commit f4dc73b

Please sign in to comment.