-
Notifications
You must be signed in to change notification settings - Fork 852
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from langchain-ai/eugene/update_ingestion
Add file parsing based on mimetype
- Loading branch information
Showing
18 changed files
with
432 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
"""Code to ingest blob into a vectorstore. | ||
Code is responsible for taking binary data, parsing it and then indexing it | ||
into a vector store. | ||
This code should be agnostic to how the blob got generated; i.e., it does not | ||
know about server/uploading etc. | ||
""" | ||
from typing import List | ||
|
||
from langchain.document_loaders import Blob | ||
from langchain.document_loaders.base import BaseBlobParser | ||
from langchain.schema import Document | ||
from langchain.schema.vectorstore import VectorStore | ||
from langchain.text_splitter import TextSplitter | ||
|
||
|
||
def _update_document_metadata(document: Document, namespace: str) -> None:
    """Tag ``document`` with ``namespace`` by mutating its metadata in place.

    Any value already stored under the ``"namespace"`` key is overwritten.
    """
    metadata = document.metadata
    metadata["namespace"] = namespace
|
||
|
||
# PUBLIC API | ||
|
||
|
||
def ingest_blob(
    blob: Blob,
    parser: BaseBlobParser,
    text_splitter: TextSplitter,
    vectorstore: VectorStore,
    namespace: str,
    *,
    batch_size: int = 100,
) -> List[str]:
    """Parse a blob, split it into chunks and index them into the vectorstore.

    The blob is parsed lazily with ``parser.lazy_parse``. Each parsed document
    is split with ``text_splitter.split_documents``, every resulting chunk is
    tagged with ``namespace`` and the chunks are written to ``vectorstore``
    with ``add_documents``.

    Args:
        blob: The binary data to ingest.
        parser: Parser used to turn the blob into documents.
        text_splitter: Splitter applied to each parsed document.
        vectorstore: Destination for the resulting chunks.
        namespace: Value stored under the ``"namespace"`` metadata key of
            every indexed chunk.
        batch_size: Maximum number of chunks passed to a single
            ``add_documents`` call.

    Returns:
        The ids returned by the vectorstore, in the order the batches were
        written.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    pending = []
    ids = []
    for document in parser.lazy_parse(blob):
        chunks = text_splitter.split_documents([document])
        for chunk in chunks:
            _update_document_metadata(chunk, namespace)
        pending.extend(chunks)

        # A single parsed document can split into more than ``batch_size``
        # chunks, so drain in fixed-size slices instead of flushing the whole
        # buffer at once; this keeps every write within the requested limit.
        while len(pending) >= batch_size:
            ids.extend(vectorstore.add_documents(pending[:batch_size]))
            del pending[:batch_size]

    # Flush whatever is left over after the last full batch.
    if pending:
        ids.extend(vectorstore.add_documents(pending))

    return ids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
"""Module contains logic for parsing binary blobs into text.""" | ||
from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser | ||
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser | ||
from langchain.document_loaders.parsers.msword import MsWordParser | ||
from langchain.document_loaders.parsers.txt import TextParser | ||
|
||
# Maps each supported mimetype to the parser instance that handles it.
# Both Word mimetypes (legacy .doc and OOXML .docx) get their own
# MsWordParser instance.
HANDLERS = {
    "application/pdf": PDFMinerParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
    "application/msword": MsWordParser(),
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": MsWordParser(),  # noqa: E501
}

# Alphabetically sorted list of the mimetypes that have a handler above.
SUPPORTED_MIMETYPES = sorted(HANDLERS)

# PUBLIC API

# Parser that dispatches on the blob's mimetype; no fallback parser is
# configured for mimetypes missing from HANDLERS.
MIMETYPE_BASED_PARSER = MimeTypeBasedParser(handlers=HANDLERS, fallback_parser=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
"""Test parsing logic.""" | ||
import mimetypes | ||
|
||
from agent_executor.parsing import MIMETYPE_BASED_PARSER, SUPPORTED_MIMETYPES | ||
from langchain.document_loaders import Blob | ||
|
||
from tests.unit_tests.fixtures import get_sample_paths | ||
|
||
|
||
def test_list_of_supported_mimetypes() -> None:
    """Pin the exact list of supported mimetypes.

    The list is expected to grow over time; spelling it out here guards
    against typos in the mimetype strings.
    """
    expected_mimetypes = [
        "application/msword",
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/html",
        "text/plain",
    ]
    assert SUPPORTED_MIMETYPES == expected_mimetypes
|
||
|
||
def test_attempt_to_parse_each_fixture() -> None:
    """Parse every fixture whose guessed mimetype is supported.

    Each supported fixture must yield exactly one document whose metadata
    records the fixture path as ``source`` and whose content contains the
    parrot emoji. Finally, every supported mimetype except the known-missing
    ones must have been exercised by at least one fixture.
    """
    seen_mimetypes = set()
    for path in get_sample_paths():
        type_, _ = mimetypes.guess_type(path)
        if type_ not in SUPPORTED_MIMETYPES:
            continue
        seen_mimetypes.add(type_)
        blob = Blob.from_path(path)
        try:
            # Parsing happens inside the ``try`` so that a parser crash is
            # reported with the offending fixture path, just like a failed
            # assertion on the parsed output.
            documents = MIMETYPE_BASED_PARSER.parse(blob)
            assert len(documents) == 1
            doc = documents[0]
            assert "source" in doc.metadata
            assert doc.metadata["source"] == str(path)
            assert "🦜" in doc.page_content
        except Exception as e:
            raise AssertionError(f"Failed to parse {path}") from e

    # No fixture exists for this mimetype, so it is excluded from the check.
    known_missing = {"application/msword"}
    assert set(SUPPORTED_MIMETYPES) - known_missing == seen_mimetypes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from io import BytesIO | ||
|
||
from agent_executor.upload import IngestRunnable, _guess_mimetype | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
|
||
from tests.unit_tests.fixtures import get_sample_paths | ||
from tests.unit_tests.utils import InMemoryVectorStore | ||
|
||
|
||
def test_ingestion_runnable() -> None:
    """Invoke the ingestion runnable on a tiny in-memory file.

    A four-byte payload is expected to produce exactly one id.
    """
    store = InMemoryVectorStore()
    text_splitter = RecursiveCharacterTextSplitter()
    ingest = IngestRunnable(
        text_splitter=text_splitter,
        vectorstore=store,
        input_key="file_contents",
        assistant_id="TheParrot",
    )

    # NOTE(review): the runnable presumably reads ``.name`` from the uploaded
    # file object -- confirm against agent_executor.upload.
    payload = BytesIO(b"test")
    payload.name = "filename"

    returned_ids = ingest.invoke(payload)
    assert len(returned_ids) == 1
|
||
|
||
def test_mimetype_guessing() -> None:
    """Check the mimetype guessed from the raw bytes of every fixture."""
    name_to_mime = {
        sample.name: _guess_mimetype(sample.read_bytes())
        for sample in sorted(get_sample_paths())
    }

    expected = {
        "sample.docx": (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ),
        "sample.epub": "application/epub+zip",
        "sample.html": "text/html",
        "sample.odt": "application/vnd.oasis.opendocument.text",
        "sample.pdf": "application/pdf",
        "sample.rtf": "text/rtf",
        "sample.txt": "text/plain",
    }
    assert expected == name_to_mime
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from pathlib import Path | ||
from typing import List | ||
|
||
# Directory containing this module; the sample fixtures live next to it.
HERE = Path(__file__).parent

# PUBLIC API


def get_sample_paths() -> List[Path]:
    """Return the paths of all ``sample.*`` fixtures in this directory.

    The result is sorted because ``Path.glob`` yields entries in an
    unspecified, filesystem-dependent order; sorting makes the iteration
    order the same on every machine.
    """
    return sorted(HERE.glob("sample.*"))
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><style type="text/css">.lst-kix_n6n0tzfwn8i8-5>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-6>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-8{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-7{list-style-type:none}.lst-kix_n6n0tzfwn8i8-3>li:before{content:"\0025cf "}.lst-kix_n6n0tzfwn8i8-4>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-7>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-8>li:before{content:"\0025a0 "}.lst-kix_n6n0tzfwn8i8-1>li:before{content:"\0025cb "}.lst-kix_n6n0tzfwn8i8-2>li:before{content:"\0025a0 "}li.li-bullet-0:before{margin-left:-18pt;white-space:nowrap;display:inline-block;min-width:18pt}.lst-kix_n6n0tzfwn8i8-0>li:before{content:"\0025cf "}ul.lst-kix_n6n0tzfwn8i8-2{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-1{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-0{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-6{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-5{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-4{list-style-type:none}ul.lst-kix_n6n0tzfwn8i8-3{list-style-type:none}ol{margin:0;padding:0}table td,table th{padding:0}.c6{border-right-style:solid;padding:5pt 5pt 5pt 
5pt;border-bottom-color:#000000;border-top-width:1pt;border-right-width:1pt;border-left-color:#000000;vertical-align:top;border-right-color:#000000;border-left-width:1pt;border-top-style:solid;border-left-style:solid;border-bottom-width:1pt;width:156pt;border-top-color:#000000;border-bottom-style:solid}.c0{-webkit-text-decoration-skip:none;color:#000000;font-weight:400;text-decoration:underline;vertical-align:baseline;text-decoration-skip-ink:none;font-size:11pt;font-family:"Arial";font-style:normal}.c4{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left;height:11pt}.c11{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:italic}.c3{color:#000000;font-weight:400;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c12{color:#000000;font-weight:700;text-decoration:none;vertical-align:baseline;font-size:11pt;font-family:"Arial";font-style:normal}.c7{padding-top:0pt;padding-bottom:0pt;line-height:1.0;orphans:2;widows:2;text-align:left}.c1{padding-top:0pt;padding-bottom:0pt;line-height:1.15;orphans:2;widows:2;text-align:left}.c8{text-decoration-skip-ink:none;-webkit-text-decoration-skip:none;color:#1155cc;text-decoration:underline}.c14{border-spacing:0;border-collapse:collapse;margin-right:auto}.c13{background-color:#ffffff;max-width:468pt;padding:72pt 72pt 72pt 
72pt}.c15{padding:0;margin:0}.c10{margin-left:36pt;padding-left:0pt}.c5{color:inherit;text-decoration:inherit}.c9{height:11pt}.c2{height:0pt}.title{padding-top:0pt;color:#000000;font-size:26pt;padding-bottom:3pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}.subtitle{padding-top:0pt;color:#666666;font-size:15pt;padding-bottom:16pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}li{color:#000000;font-size:11pt;font-family:"Arial"}p{margin:0;color:#000000;font-size:11pt;font-family:"Arial"}h1{padding-top:20pt;color:#000000;font-size:20pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h2{padding-top:18pt;color:#000000;font-size:16pt;padding-bottom:6pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h3{padding-top:16pt;color:#434343;font-size:14pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h4{padding-top:14pt;color:#666666;font-size:12pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h5{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;orphans:2;widows:2;text-align:left}h6{padding-top:12pt;color:#666666;font-size:11pt;padding-bottom:4pt;font-family:"Arial";line-height:1.15;page-break-after:avoid;font-style:italic;orphans:2;widows:2;text-align:left}</style></head><body class="c13 doc-content"><p class="c1"><span class="c3">🦜️ LangChain</span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c0">Underline</span></p><p class="c1 c9"><span class="c0"></span></p><p class="c1"><span class="c12">Bold</span></p><p class="c1 c9"><span class="c12"></span></p><p class="c1"><span 
class="c11">Italics</span></p><p class="c1 c9"><span class="c11"></span></p><p class="c1 c9"><span class="c11"></span></p><a id="t.e89270b97fc18eabe5c666cba79cd82cff5b5c3d"></a><a id="t.0"></a><table class="c14"><tbody><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c4"><span class="c12"></span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Col 2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">1</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">2</span></p></td></tr><tr class="c2"><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c12">Row 2</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">3</span></p></td><td class="c6" colspan="1" rowspan="1"><p class="c7"><span class="c3">4</span></p></td></tr></tbody></table><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span>Link: </span><span class="c8"><a class="c5" href="https://www.google.com/url?q=https://www.langchain.com/&sa=D&source=editors&ust=1699572948600868&usg=AOvVaw2T4jvAmPuMvcyed6PrEjq1">https://www.langchain.com/</a></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><ul class="c15 lst-kix_n6n0tzfwn8i8-0 start"><li class="c1 c10 li-bullet-0"><span class="c3">Item 1</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 2</span></li><li class="c1 c10 li-bullet-0"><span class="c3">Item 3</span></li><li class="c1 c10 li-bullet-0"><span class="c3">We also love cats 🐱</span></li></ul><p class="c1 c9"><span class="c3"></span></p><p class="c1"><span class="c3">Image</span></p><p class="c1 c9"><span class="c3"></span></p><p 
class="c1"><span style="overflow: hidden; display: inline-block; margin: 0.00px 0.00px; border: 0.00px solid #000000; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px); width: 624.00px; height: 132.00px;"><img alt="" src="sample_files/image1.png" style="width: 624.00px; height: 132.00px; margin-left: 0.00px; margin-top: 0.00px; transform: rotate(0.00rad) translateZ(0px); -webkit-transform: rotate(0.00rad) translateZ(0px);" title=""></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p><p class="c1 c9"><span class="c3"></span></p></body></html> |
Binary file not shown.
Binary file not shown.
Oops, something went wrong.