Skip to content

Commit

Permalink
enhancement: tell users to install missing extras (#1167)
Browse files Browse the repository at this point in the history
### Summary

Updates `partition` to tell users to install the appropriate extras
if they're missing. Prior to this PR, users would get an exception
stating that `partition_pdf` (or whichever function requires extras)
does not exist.

### Testing

First `pip uninstall ebooklib`. Then run

```python
from unstructured.partition.auto import partition

partition(filename="example-docs/winter-sports.epub")
```

The error should look like

```python
ImportError: partition_epub is not available. Install the epub dependencies with pip install "unstructured[epub]"
```
  • Loading branch information
MthwRobinson authored Aug 22, 2023
1 parent f639d04 commit ad595d3
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 24 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
## 0.10.5-dev1
## 0.10.5-dev2

### Enhancements
* Create new CI Pipelines
- Checking text, xml, email, and html doc tests against the library installed without extras
- Checking each library extra against their respective tests
* `partition` raises an error and tells the user to install the appropriate extra if a filetype
is detected that is missing dependencies.

## 0.10.3
* Adds ability to reuse connections per process in unstructured-ingest
Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List


csv_reqs = load_requirements("requirements/extra-csv.in")
doc_reqs = load_requirements("requirements/extra-docx.in")
docx_reqs = load_requirements("requirements/extra-docx.in")
epub_reqs = load_requirements("requirements/extra-epub.in")
image_reqs = load_requirements("requirements/extra-pdf-image.in")
Expand All @@ -48,6 +49,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
odt_reqs = load_requirements("requirements/extra-odt.in")
org_reqs = load_requirements("requirements/extra-pandoc.in")
pdf_reqs = load_requirements("requirements/extra-pdf-image.in")
ppt_reqs = load_requirements("requirements/extra-pptx.in")
pptx_reqs = load_requirements("requirements/extra-pptx.in")
rtf_reqs = load_requirements("requirements/extra-pandoc.in")
rst_reqs = load_requirements("requirements/extra-pandoc.in")
Expand Down Expand Up @@ -109,6 +111,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
# Document specific extra requirements
"all-docs": all_doc_reqs,
"csv": csv_reqs,
"doc": doc_reqs,
"docx": docx_reqs,
"epub": epub_reqs,
"image": image_reqs,
Expand All @@ -117,6 +120,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
"odt": odt_reqs,
"org": org_reqs,
"pdf": pdf_reqs,
"ppt": ppt_reqs,
"pptx": pptx_reqs,
"rtf": rtf_reqs,
"rst": rst_reqs,
Expand Down
27 changes: 21 additions & 6 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, FileType
from unstructured.partition import auto
from unstructured.partition.auto import partition
from unstructured.partition.auto import _get_partition_with_extras, partition
from unstructured.partition.common import convert_office_doc
from unstructured.staging.base import elements_to_json

Expand Down Expand Up @@ -321,11 +321,13 @@ def test_auto_partition_pdf_uses_table_extraction():
assert mock_process_file_with_model.call_args[1]["extract_tables"]


def test_auto_partition_pdf_with_fast_strategy():
def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

mock_return = [NarrativeText("Hello there!")]
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
mock_partition_with_extras_map = {"pdf": mock_partition}
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
partition(filename=filename, strategy="fast")

mock_partition.assert_called_once_with(
Expand Down Expand Up @@ -563,11 +565,13 @@ def test_auto_partition_odt_from_file():
("jdsfjdfsjkds", "pdf", None),
],
)
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected, monkeypatch):
with patch(
f"unstructured.partition.auto.partition_{routing_func}",
lambda *args, **kwargs: [Text("text 1"), Text("text 2")],
):
) as mock_partition:
mock_partition_with_extras_map = {routing_func: mock_partition}
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
assert len(elements) == 2
assert all(el.metadata.filetype == expected for el in elements)
Expand All @@ -580,15 +584,17 @@ def test_auto_adds_filetype_to_metadata(content_type, routing_func, expected):
(None, FILETYPE_TO_MIMETYPE[FileType.PDF]),
],
)
def test_auto_filetype_overrides_file_specific(content_type, expected):
def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypatch):
pdf_metadata = ElementMetadata(filetype="imapdf")
with patch(
"unstructured.partition.auto.partition_pdf",
lambda *args, **kwargs: [
Text("text 1", metadata=pdf_metadata),
Text("text 2", metadata=pdf_metadata),
],
):
) as mock_partition:
mock_partition_with_extras_map = {"pdf": mock_partition}
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
assert len(elements) == 2
assert all(el.metadata.filetype == expected for el in elements)
Expand Down Expand Up @@ -873,3 +879,12 @@ def test_auto_partition_metadata_file_filename():
with open(filename) as f:
elements = partition(file=f, file_filename=filename)
assert elements[0].metadata.filename == os.path.split(filename)[-1]


def test_get_partition_with_extras_prompts_for_install_if_missing():
    """An empty registry must produce an ImportError carrying the pip install hint."""
    empty_registry = {}
    expected_hint = 'Install the pdf dependencies with pip install "unstructured[pdf]"'

    with pytest.raises(ImportError) as raised:
        _get_partition_with_extras("pdf", empty_registry)

    assert expected_hint in str(raised.value)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.5-dev1" # pragma: no cover
__version__ = "0.10.5-dev2" # pragma: no cover
93 changes: 77 additions & 16 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,55 +20,100 @@
from unstructured.partition.xml import partition_xml
from unstructured.utils import dependency_exists

# Registry of partition functions that depend on optional extras, keyed by a
# short document-type string (e.g. "pdf", "docx"). An entry is added below only
# when the corresponding dependency_exists(...) guard passes, so a missing key
# means that document type's partition function was not imported.
# NOTE(review): dependency_exists presumably reports whether the named module
# is importable -- confirm against unstructured.utils.
PARTITION_WITH_EXTRAS_MAP: Dict[str, Callable] = {}

# csv and tsv are both gated on pandas.
if dependency_exists("pandas"):
from unstructured.partition.csv import partition_csv
from unstructured.partition.tsv import partition_tsv

PARTITION_WITH_EXTRAS_MAP["csv"] = partition_csv
PARTITION_WITH_EXTRAS_MAP["tsv"] = partition_tsv


# doc and docx are both gated on the "docx" module.
if dependency_exists("docx"):
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx

PARTITION_WITH_EXTRAS_MAP["doc"] = partition_doc
PARTITION_WITH_EXTRAS_MAP["docx"] = partition_docx


# odt is registered only when both "docx" and "pypandoc" are present.
if dependency_exists("docx") and dependency_exists("pypandoc"):
from unstructured.partition.odt import partition_odt

PARTITION_WITH_EXTRAS_MAP["odt"] = partition_odt


# epub is gated on ebooklib.
if dependency_exists("ebooklib"):
from unstructured.partition.epub import partition_epub

PARTITION_WITH_EXTRAS_MAP["epub"] = partition_epub


# org, rst and rtf are all gated on pypandoc.
if dependency_exists("pypandoc"):
from unstructured.partition.org import partition_org
from unstructured.partition.rst import partition_rst
from unstructured.partition.rtf import partition_rtf

PARTITION_WITH_EXTRAS_MAP["org"] = partition_org
PARTITION_WITH_EXTRAS_MAP["rst"] = partition_rst
PARTITION_WITH_EXTRAS_MAP["rtf"] = partition_rtf


# md is gated on the "markdown" module.
if dependency_exists("markdown"):
from unstructured.partition.md import partition_md

PARTITION_WITH_EXTRAS_MAP["md"] = partition_md


# msg is gated on msg_parser.
if dependency_exists("msg_parser"):
from unstructured.partition.msg import partition_msg

PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg


# pdf is registered only when every module listed in pdf_imports is present.
pdf_imports = ["pdf2image", "pdfminer", "PIL"]
if all(dependency_exists(dep) for dep in pdf_imports):
from unstructured.partition.pdf import partition_pdf

PARTITION_WITH_EXTRAS_MAP["pdf"] = partition_pdf


# image is gated on unstructured_inference.
if dependency_exists("unstructured_inference"):
from unstructured.partition.image import partition_image

PARTITION_WITH_EXTRAS_MAP["image"] = partition_image


# ppt and pptx are both gated on the "pptx" module.
if dependency_exists("pptx"):
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx

PARTITION_WITH_EXTRAS_MAP["ppt"] = partition_ppt
PARTITION_WITH_EXTRAS_MAP["pptx"] = partition_pptx


# xlsx is registered only when both pandas and openpyxl are present.
if dependency_exists("pandas") and dependency_exists("openpyxl"):
from unstructured.partition.xlsx import partition_xlsx

PARTITION_WITH_EXTRAS_MAP["xlsx"] = partition_xlsx


def _get_partition_with_extras(
doc_type: str,
partition_with_extras_map: Optional[Dict[str, Callable]] = None,
):
if partition_with_extras_map is None:
partition_with_extras_map = PARTITION_WITH_EXTRAS_MAP
_partition_func = partition_with_extras_map.get(doc_type)
if _partition_func is None:
raise ImportError(
f"partition_{doc_type} is not available. "
f"Install the {doc_type} dependencies with "
f'pip install "unstructured[{doc_type}]"',
)
return _partition_func


def partition(
filename: Optional[str] = None,
Expand Down Expand Up @@ -170,15 +215,19 @@ def partition(
kwargs.setdefault("metadata_filename", file_filename)

if filetype == FileType.DOC:
elements = partition_doc(filename=filename, file=file, **kwargs)
_partition_doc = _get_partition_with_extras("doc")
elements = _partition_doc(filename=filename, file=file, **kwargs)
elif filetype == FileType.DOCX:
elements = partition_docx(filename=filename, file=file, **kwargs)
_partition_docx = _get_partition_with_extras("docx")
elements = _partition_docx(filename=filename, file=file, **kwargs)
elif filetype == FileType.ODT:
elements = partition_odt(filename=filename, file=file, **kwargs)
_partition_odt = _get_partition_with_extras("odt")
elements = _partition_odt(filename=filename, file=file, **kwargs)
elif filetype == FileType.EML:
elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
elif filetype == FileType.MSG:
elements = partition_msg(filename=filename, file=file, **kwargs)
_partition_msg = _get_partition_with_extras("msg")
elements = _partition_msg(filename=filename, file=file, **kwargs)
elif filetype == FileType.HTML:
elements = partition_html(
filename=filename,
Expand All @@ -196,35 +245,40 @@ def partition(
**kwargs,
)
elif filetype == FileType.EPUB:
elements = partition_epub(
_partition_epub = _get_partition_with_extras("epub")
elements = _partition_epub(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.ORG:
elements = partition_org(
_partition_org = _get_partition_with_extras("org")
elements = _partition_org(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.RST:
elements = partition_rst(
_partition_rst = _get_partition_with_extras("rst")
elements = _partition_rst(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.MD:
elements = partition_md(
_partition_md = _get_partition_with_extras("md")
elements = _partition_md(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PDF:
elements = partition_pdf(
_partition_pdf = _get_partition_with_extras("pdf")
elements = _partition_pdf(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
Expand All @@ -235,7 +289,8 @@ def partition(
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
elements = partition_image(
_partition_image = _get_partition_with_extras("image")
elements = _partition_image(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
Expand All @@ -254,21 +309,24 @@ def partition(
**kwargs,
)
elif filetype == FileType.RTF:
elements = partition_rtf(
_partition_rtf = _get_partition_with_extras("rtf")
elements = _partition_rtf(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PPT:
elements = partition_ppt(
_partition_ppt = _get_partition_with_extras("ppt")
elements = _partition_ppt(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
**kwargs,
)
elif filetype == FileType.PPTX:
elements = partition_pptx(
_partition_pptx = _get_partition_with_extras("pptx")
elements = _partition_pptx(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
Expand All @@ -282,11 +340,14 @@ def partition(
)
elements = partition_json(filename=filename, file=file, **kwargs)
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
elements = partition_xlsx(filename=filename, file=file, **kwargs)
_partition_xlsx = _get_partition_with_extras("xlsx")
elements = _partition_xlsx(filename=filename, file=file, **kwargs)
elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file, **kwargs)
_partition_csv = _get_partition_with_extras("csv")
elements = _partition_csv(filename=filename, file=file, **kwargs)
elif filetype == FileType.TSV:
elements = partition_tsv(filename=filename, file=file, **kwargs)
_partition_tsv = _get_partition_with_extras("tsv")
elements = _partition_tsv(filename=filename, file=file, **kwargs)
elif filetype == FileType.EMPTY:
elements = []
else:
Expand Down

0 comments on commit ad595d3

Please sign in to comment.