From ad595d32f6ca42a25438e53ff26e430451873552 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructured.io>
Date: Mon, 21 Aug 2023 23:00:21 -0400
Subject: [PATCH] enhancement: tell users to install missing extras (#1167)

### Summary

Updates `partition` to let users know to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating `partition_pdf` (or whichever function that requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
---
 CHANGELOG.md                             |  4 +-
 setup.py                                 |  4 +
 test_unstructured/partition/test_auto.py | 27 +++++--
 unstructured/__version__.py              |  2 +-
 unstructured/partition/auto.py           | 93 ++++++++++++++++++++----
 5 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 948ef5881f..ad32c050c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,11 @@
-## 0.10.5-dev1
+## 0.10.5-dev2
 
 ### Enhancements
 * Create new CI Pipelines
   - Checking text, xml, email, and html doc tests against the library installed without extras
   - Checking each library extra against their respective tests
+* `partition` raises an error and tells the user to install the appropriate extra if a filetype
+  is detected that is missing dependencies.
 
 ## 0.10.3
 * Adds ability to reuse connections per process in unstructured-ingest
diff --git a/setup.py b/setup.py
index d2e9809c3d..183d886e74 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
 
 
 csv_reqs = load_requirements("requirements/extra-csv.in")
+doc_reqs = load_requirements("requirements/extra-docx.in")
 docx_reqs = load_requirements("requirements/extra-docx.in")
 epub_reqs = load_requirements("requirements/extra-epub.in")
 image_reqs = load_requirements("requirements/extra-pdf-image.in")
@@ -48,6 +49,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
 odt_reqs = load_requirements("requirements/extra-odt.in")
 org_reqs = load_requirements("requirements/extra-pandoc.in")
 pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
+ppt_reqs = load_requirements("requirements/extra-pptx.in")
 pptx_reqs = load_requirements("requirements/extra-pptx.in")
 rtf_reqs = load_requirements("requirements/extra-pandoc.in")
 rst_reqs = load_requirements("requirements/extra-pandoc.in")
@@ -109,6 +111,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         # Document specific extra requirements
         "all-docs": all_doc_reqs,
         "csv": csv_reqs,
+        "doc": doc_reqs,
         "docx": docx_reqs,
         "epub": epub_reqs,
         "image": image_reqs,
@@ -117,6 +120,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
         "odt": odt_reqs,
         "org": org_reqs,
         "pdf": pdf_reqs,
+        "ppt": ppt_reqs,
         "pptx": pptx_reqs,
         "rtf": rtf_reqs,
         "rst": rst_reqs,
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 4389e71522..efe846624d 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -21,7 +21,7 @@
 )
 from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
 from unstructured.partition import auto
-from unstructured.partition.auto import partition
+from unstructured.partition.auto import _get_partition_with_extras, partition
 from unstructured.partition.common import convert_office_doc
 from unstructured.staging.base import elements_to_json
 
@@ -321,11 +321,13 @@ def test_auto_partition_pdf_uses_table_extraction():
         assert mock_process_file_with_model.call_args[1]["extract_tables"]
 
 
-def test_auto_partition_pdf_with_fast_strategy():
+def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
 
     mock_return = [NarrativeText("Hello there!")]
     with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
+        mock_partition_with_extras_map = {"pdf": mock_partition}
+        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
         partition(filename=filename, strategy="fast")
 
     mock_partition.assert_called_once_with(
@@ -563,11 +565,13 @@ def test_auto_partition_odt_from_file():
         ("jdsfjdfsjkds", "pdf", None),
     ],
 )
-def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
+def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
     with patch(
         f"unstructured.partition.auto.partition_{routing_func}",
         lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
-    ):
+    ) as mock_partition:
+        mock_partition_with_extras_map = {routing_func: mock_partition}
+        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
         elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
     assert len(elements) == 2
     assert all(el.metadata.filetype == expected for el in elements)
@@ -580,7 +584,7 @@ def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
         (None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
     ],
 )
-def test_auto_filetype_overrides_file_specific(content_type, expected):
+def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
     pdf_metadata = ElementMetadata(filetype="imapdf")
     with patch(
         "unstructured.partition.auto.partition_pdf",
@@ -588,7 +592,9 @@ def test_auto_filetype_overrides_file_specific(content_type, expected):
             Text("text 1", metadata=pdf_metadata),
             Text("text 2", metadata=pdf_metadata),
         ],
-    ):
+    ) as mock_partition:
+        mock_partition_with_extras_map = {"pdf": mock_partition}
+        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
         elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
     assert len(elements) == 2
     assert all(el.metadata.filetype == expected for el in elements)
@@ -873,3 +879,12 @@ def test_auto_partition_metadata_file_filename():
     with open(filename) as f:
         elements = partition(file=f, file_filename=filename)
     assert elements[0].metadata.filename == os.path.split(filename)[-1]
+
+
+def test_get_partition_with_extras_prompts_for_install_if_missing():
+    partition_with_extras_map = {}
+    with pytest.raises(ImportError) as exception_info:
+        _get_partition_with_extras("pdf", partition_with_extras_map)
+
+    msg = str(exception_info.value)
+    assert 'Install the pdf dependencies with pip install "unstructured[pdf]"' in msg
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 4b35007c64..ec85281604 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.5-dev1"  # pragma: no cover
+__version__ = "0.10.5-dev2"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index d15649b4fc..d04f78da0d 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -20,55 +20,100 @@
 from unstructured.partition.xml import partition_xml
 from unstructured.utils import dependency_exists
 
+PARTITION_WITH_EXTRAS_MAP: Dict[str, Callable] = {}
+
 if dependency_exists("pandas"):
     from unstructured.partition.csv import partition_csv
     from unstructured.partition.tsv import partition_tsv
 
+    PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
+    PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv
+
 
 if dependency_exists("docx"):
     from unstructured.partition.doc import partition_doc
     from unstructured.partition.docx import partition_docx
 
+    PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
+    PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx
+
 
 if dependency_exists("docx") and dependency_exists("pypandoc"):
     from unstructured.partition.odt import partition_odt
 
+    PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt
+
 
 if dependency_exists("ebooklib"):
     from unstructured.partition.epub import partition_epub
 
+    PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub
+
 
 if dependency_exists("pypandoc"):
     from unstructured.partition.org import partition_org
     from unstructured.partition.rst import partition_rst
     from unstructured.partition.rtf import partition_rtf
 
+    PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
+    PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
+    PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf
+
 
 if dependency_exists("markdown"):
     from unstructured.partition.md import partition_md
 
+    PARTITION_WITH_EXTRAS_MAP["md"] = partition_md
+
 
 if dependency_exists("msg_parser"):
     from unstructured.partition.msg import partition_msg
 
+    PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg
+
 
 pdf_imports = ["pdf2image", "pdfminer", "PIL"]
 if all(dependency_exists(dep) for dep in pdf_imports):
     from unstructured.partition.pdf import partition_pdf
 
+    PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf
+
 
 if dependency_exists("unstructured_inference"):
     from unstructured.partition.image import partition_image
 
+    PARTITION_WITH_EXTRAS_MAP["image"] = partition_image
+
 
 if dependency_exists("pptx"):
     from unstructured.partition.ppt import partition_ppt
     from unstructured.partition.pptx import partition_pptx
 
+    PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
+    PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx
+
 
 if dependency_exists("pandas") and dependency_exists("openpyxl"):
     from unstructured.partition.xlsx import partition_xlsx
 
+    PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx
+
+
+def _get_partition_with_extras(
+    doc_type: str,
+    partition_with_extras_map: Optional[Dict[str, Callable]] = None,
+):
+    if partition_with_extras_map is None:
+        partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
+    _partition_func = partition_with_extras_map.get(doc_type)
+    if _partition_func is None:
+        raise ImportError(
+            f"partition_{doc_type} is not available. "
+            f"Install the {doc_type} dependencies with "
+            f'pip install "unstructured[{doc_type}]"',
+        )
+    return _partition_func
+
 
 def partition(
     filename: Optional[str] = None,
@@ -170,15 +215,19 @@ def partition(
         kwargs.setdefault("metadata_filename", file_filename)
 
     if filetype == FileType.DOC:
-        elements = partition_doc(filename=filename, file=file, **kwargs)
+        _partition_doc = _get_partition_with_extras("doc")
+        elements = _partition_doc(filename=filename, file=file, **kwargs)
     elif filetype == FileType.DOCX:
-        elements = partition_docx(filename=filename, file=file, **kwargs)
+        _partition_docx = _get_partition_with_extras("docx")
+        elements = _partition_docx(filename=filename, file=file, **kwargs)
     elif filetype == FileType.ODT:
-        elements = partition_odt(filename=filename, file=file, **kwargs)
+        _partition_odt = _get_partition_with_extras("odt")
+        elements = _partition_odt(filename=filename, file=file, **kwargs)
     elif filetype == FileType.EML:
         elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
     elif filetype == FileType.MSG:
-        elements = partition_msg(filename=filename, file=file, **kwargs)
+        _partition_msg = _get_partition_with_extras("msg")
+        elements = _partition_msg(filename=filename, file=file, **kwargs)
     elif filetype == FileType.HTML:
         elements = partition_html(
             filename=filename,
@@ -196,35 +245,40 @@ def partition(
             **kwargs,
         )
     elif filetype == FileType.EPUB:
-        elements = partition_epub(
+        _partition_epub = _get_partition_with_extras("epub")
+        elements = _partition_epub(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.ORG:
-        elements = partition_org(
+        _partition_org = _get_partition_with_extras("org")
+        elements = _partition_org(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.RST:
-        elements = partition_rst(
+        _partition_rst = _get_partition_with_extras("rst")
+        elements = _partition_rst(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.MD:
-        elements = partition_md(
+        _partition_md = _get_partition_with_extras("md")
+        elements = _partition_md(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.PDF:
-        elements = partition_pdf(
+        _partition_pdf = _get_partition_with_extras("pdf")
+        elements = _partition_pdf(
             filename=filename,  # type: ignore
             file=file,  # type: ignore
             url=None,
@@ -235,7 +289,8 @@ def partition(
             **kwargs,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
-        elements = partition_image(
+        _partition_image = _get_partition_with_extras("image")
+        elements = _partition_image(
             filename=filename,  # type: ignore
             file=file,  # type: ignore
             url=None,
@@ -254,21 +309,24 @@ def partition(
             **kwargs,
         )
     elif filetype == FileType.RTF:
-        elements = partition_rtf(
+        _partition_rtf = _get_partition_with_extras("rtf")
+        elements = _partition_rtf(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.PPT:
-        elements = partition_ppt(
+        _partition_ppt = _get_partition_with_extras("ppt")
+        elements = _partition_ppt(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
     elif filetype == FileType.PPTX:
-        elements = partition_pptx(
+        _partition_pptx = _get_partition_with_extras("pptx")
+        elements = _partition_pptx(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
@@ -282,11 +340,14 @@ def partition(
             )
         elements = partition_json(filename=filename, file=file, **kwargs)
     elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
-        elements = partition_xlsx(filename=filename, file=file, **kwargs)
+        _partition_xlsx = _get_partition_with_extras("xlsx")
+        elements = _partition_xlsx(filename=filename, file=file, **kwargs)
     elif filetype == FileType.CSV:
-        elements = partition_csv(filename=filename, file=file, **kwargs)
+        _partition_csv = _get_partition_with_extras("csv")
+        elements = _partition_csv(filename=filename, file=file, **kwargs)
     elif filetype == FileType.TSV:
-        elements = partition_tsv(filename=filename, file=file, **kwargs)
+        _partition_tsv = _get_partition_with_extras("tsv")
+        elements = _partition_tsv(filename=filename, file=file, **kwargs)
     elif filetype == FileType.EMPTY:
         elements = []
     else: