From ad595d32f6ca42a25438e53ff26e430451873552 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Mon, 21 Aug 2023 23:00:21 -0400 Subject: [PATCH] enhancement: tell users to install missing extras (#1167) ### Summary Updates `partition` to let users know to install the appropriate extras if they're missing. Prior to this PR, users would get an exception stating `partition_pdf` (or whichever function requires extras) does not exist. ### Testing First `pip uninstall ebooklib`. Then run ```python from unstructured.partition.auto import partition partition(filename="example-docs/winter-sports.epub") ``` The error should look like ```python ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]" ``` --- CHANGELOG.md | 4 +- setup.py | 4 + test_unstructured/partition/test_auto.py | 27 +++++-- unstructured/__version__.py | 2 +- unstructured/partition/auto.py | 93 ++++++++++++++++++++---- 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 948ef5881f..ad32c050c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.10.5-dev1 +## 0.10.5-dev2 ### Enhancements * Create new CI Pipelines - Checking text, xml, email, and html doc tests against the library installed without extras - Checking each library extra against their respective tests +* `partition` raises an error and tells the user to install the appropriate extra if a filetype + is detected that is missing dependencies. 
## 0.10.3 * Adds ability to reuse connections per process in unstructured-ingest diff --git a/setup.py b/setup.py index d2e9809c3d..183d886e74 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List csv_reqs = load_requirements("requirements/extra-csv.in") +doc_reqs = load_requirements("requirements/extra-docx.in") docx_reqs = load_requirements("requirements/extra-docx.in") epub_reqs = load_requirements("requirements/extra-epub.in") image_reqs = load_requirements("requirements/extra-pdf-image.in") @@ -48,6 +49,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List odt_reqs = load_requirements("requirements/extra-odt.in") org_reqs = load_requirements("requirements/extra-pandoc.in") pdf_reqs = load_requirements("requirements/extra-pdf-image.in") +ppt_reqs = load_requirements("requirements/extra-pptx.in") pptx_reqs = load_requirements("requirements/extra-pptx.in") rtf_reqs = load_requirements("requirements/extra-pandoc.in") rst_reqs = load_requirements("requirements/extra-pandoc.in") @@ -109,6 +111,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List # Document specific extra requirements "all-docs": all_doc_reqs, "csv": csv_reqs, + "doc": doc_reqs, "docx": docx_reqs, "epub": epub_reqs, "image": image_reqs, @@ -117,6 +120,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List "odt": odt_reqs, "org": org_reqs, "pdf": pdf_reqs, + "ppt": ppt_reqs, "pptx": pptx_reqs, "rtf": rtf_reqs, "rst": rst_reqs, diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 4389e71522..efe846624d 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -21,7 +21,7 @@ ) from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType from unstructured.partition import auto -from unstructured.partition.auto import 
partition +from unstructured.partition.auto import _get_partition_with_extras, partition from unstructured.partition.common import convert_office_doc from unstructured.staging.base import elements_to_json @@ -321,11 +321,13 @@ def test_auto_partition_pdf_uses_table_extraction(): assert mock_process_file_with_model.call_args[1]["extract_tables"] -def test_auto_partition_pdf_with_fast_strategy(): +def test_auto_partition_pdf_with_fast_strategy(monkeypatch): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") mock_return = [NarrativeText("Hello there!")] with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition: + mock_partition_with_extras_map = {"pdf": mock_partition} + monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) partition(filename=filename, strategy="fast") mock_partition.assert_called_once_with( @@ -563,11 +565,13 @@ def test_auto_partition_odt_from_file(): ("jdsfjdfsjkds", "pdf", None), ], ) -def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected): +def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch): with patch( f"unstructured.partition.auto.partition_{routing_func}", lambda *args, **kwargs: [Text("text 1"), Text("text 2")], - ): + ) as mock_partition: + mock_partition_with_extras_map = {routing_func: mock_partition} + monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type) assert len(elements) == 2 assert all(el.metadata.filetype == expected for el in elements) @@ -580,7 +584,7 @@ def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected): (None, FILETYPE_TO_MIMETYPE[FileType.PDF]), ], ) -def test_auto_filetype_overrides_file_specific(content_type, expected): +def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch): pdf_metadata = 
ElementMetadata(filetype="imapdf") with patch( "unstructured.partition.auto.partition_pdf", @@ -588,7 +592,9 @@ def test_auto_filetype_overrides_file_specific(content_type, expected): Text("text 1", metadata=pdf_metadata), Text("text 2", metadata=pdf_metadata), ], - ): + ) as mock_partition: + mock_partition_with_extras_map = {"pdf": mock_partition} + monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type) assert len(elements) == 2 assert all(el.metadata.filetype == expected for el in elements) @@ -873,3 +879,12 @@ def test_auto_partition_metadata_file_filename(): with open(filename) as f: elements = partition(file=f, file_filename=filename) assert elements[0].metadata.filename == os.path.split(filename)[-1] + + +def test_get_partition_with_extras_prompts_for_install_if_missing(): + partition_with_extras_map = {} + with pytest.raises(ImportError) as exception_info: + _get_partition_with_extras("pdf", partition_with_extras_map) + + msg = str(exception_info.value) + assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4b35007c64..ec85281604 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.5-dev1" # pragma: no cover +__version__ = "0.10.5-dev2" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index d15649b4fc..d04f78da0d 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -20,55 +20,100 @@ from unstructured.partition.xml import partition_xml from unstructured.utils import dependency_exists +PARTITION_WITH_EXTRAS_MAP: Dict[str, Callable] = {} + if dependency_exists("pandas"): from unstructured.partition.csv import partition_csv from unstructured.partition.tsv import partition_tsv + 
PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv + PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv + if dependency_exists("docx"): from unstructured.partition.doc import partition_doc from unstructured.partition.docx import partition_docx + PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc + PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx + if dependency_exists("docx") and dependency_exists("pypandoc"): from unstructured.partition.odt import partition_odt + PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt + if dependency_exists("ebooklib"): from unstructured.partition.epub import partition_epub + PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub + if dependency_exists("pypandoc"): from unstructured.partition.org import partition_org from unstructured.partition.rst import partition_rst from unstructured.partition.rtf import partition_rtf + PARTITION_WITH_EXTRAS_MAP["org"] = partition_org + PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst + PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf + if dependency_exists("markdown"): from unstructured.partition.md import partition_md + PARTITION_WITH_EXTRAS_MAP["md"] = partition_md + if dependency_exists("msg_parser"): from unstructured.partition.msg import partition_msg + PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg + pdf_imports = ["pdf2image", "pdfminer", "PIL"] if all(dependency_exists(dep) for dep in pdf_imports): from unstructured.partition.pdf import partition_pdf + PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf + if dependency_exists("unstructured_inference"): from unstructured.partition.image import partition_image + PARTITION_WITH_EXTRAS_MAP["image"] = partition_image + if dependency_exists("pptx"): from unstructured.partition.ppt import partition_ppt from unstructured.partition.pptx import partition_pptx + PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt + PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx + if dependency_exists("pandas") and dependency_exists("openpyxl"): from unstructured.partition.xlsx 
import partition_xlsx + PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx + + +def _get_partition_with_extras( + doc_type: str, + partition_with_extras_map: Optional[Dict[str, Callable]] = None, +): + if partition_with_extras_map is None: + partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP + _partition_func = partition_with_extras_map.get(doc_type) + if _partition_func is None: + raise ImportError( + f"partition_{doc_type} is not available. " + f"Install the {doc_type} dependencies with " + f'pip install "unstructured[{doc_type}]"', + ) + return _partition_func + def partition( filename: Optional[str] = None, @@ -170,15 +215,19 @@ def partition( kwargs.setdefault("metadata_filename", file_filename) if filetype == FileType.DOC: - elements = partition_doc(filename=filename, file=file, **kwargs) + _partition_doc = _get_partition_with_extras("doc") + elements = _partition_doc(filename=filename, file=file, **kwargs) elif filetype == FileType.DOCX: - elements = partition_docx(filename=filename, file=file, **kwargs) + _partition_docx = _get_partition_with_extras("docx") + elements = _partition_docx(filename=filename, file=file, **kwargs) elif filetype == FileType.ODT: - elements = partition_odt(filename=filename, file=file, **kwargs) + _partition_odt = _get_partition_with_extras("odt") + elements = _partition_odt(filename=filename, file=file, **kwargs) elif filetype == FileType.EML: elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs) elif filetype == FileType.MSG: - elements = partition_msg(filename=filename, file=file, **kwargs) + _partition_msg = _get_partition_with_extras("msg") + elements = _partition_msg(filename=filename, file=file, **kwargs) elif filetype == FileType.HTML: elements = partition_html( filename=filename, @@ -196,35 +245,40 @@ def partition( **kwargs, ) elif filetype == FileType.EPUB: - elements = partition_epub( + _partition_epub = _get_partition_with_extras("epub") + elements = _partition_epub( 
filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.ORG: - elements = partition_org( + _partition_org = _get_partition_with_extras("org") + elements = _partition_org( filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.RST: - elements = partition_rst( + _partition_rst = _get_partition_with_extras("rst") + elements = _partition_rst( filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.MD: - elements = partition_md( + _partition_md = _get_partition_with_extras("md") + elements = _partition_md( filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.PDF: - elements = partition_pdf( + _partition_pdf = _get_partition_with_extras("pdf") + elements = _partition_pdf( filename=filename, # type: ignore file=file, # type: ignore url=None, @@ -235,7 +289,8 @@ def partition( **kwargs, ) elif (filetype == FileType.PNG) or (filetype == FileType.JPG): - elements = partition_image( + _partition_image = _get_partition_with_extras("image") + elements = _partition_image( filename=filename, # type: ignore file=file, # type: ignore url=None, @@ -254,21 +309,24 @@ def partition( **kwargs, ) elif filetype == FileType.RTF: - elements = partition_rtf( + _partition_rtf = _get_partition_with_extras("rtf") + elements = _partition_rtf( filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.PPT: - elements = partition_ppt( + _partition_ppt = _get_partition_with_extras("ppt") + elements = _partition_ppt( filename=filename, file=file, include_page_breaks=include_page_breaks, **kwargs, ) elif filetype == FileType.PPTX: - elements = partition_pptx( + _partition_pptx = _get_partition_with_extras("pptx") + elements = _partition_pptx( filename=filename, file=file, include_page_breaks=include_page_breaks, @@ -282,11 +340,14 @@ def 
partition( ) elements = partition_json(filename=filename, file=file, **kwargs) elif (filetype == FileType.XLSX) or (filetype == FileType.XLS): - elements = partition_xlsx(filename=filename, file=file, **kwargs) + _partition_xlsx = _get_partition_with_extras("xlsx") + elements = _partition_xlsx(filename=filename, file=file, **kwargs) elif filetype == FileType.CSV: - elements = partition_csv(filename=filename, file=file, **kwargs) + _partition_csv = _get_partition_with_extras("csv") + elements = _partition_csv(filename=filename, file=file, **kwargs) elif filetype == FileType.TSV: - elements = partition_tsv(filename=filename, file=file, **kwargs) + _partition_tsv = _get_partition_with_extras("tsv") + elements = _partition_tsv(filename=filename, file=file, **kwargs) elif filetype == FileType.EMPTY: elements = [] else: