Skip to content

Commit

Permalink
chore: deprecation warning for file_filename (#1191)
Browse files Browse the repository at this point in the history
### Summary

Closes #1007. Adds a deprecation warning for the `file_filename` kwarg
to `partition`, `partition_via_api`, and `partition_multiple_via_api`.
Also catches a warning in `ebooklib` that we do not want to emit in
`unstructured`.

### Testing

```python
from unstructured.partition.auto import partition

filename = "example-docs/winter-sports.epub"

# Preferred kwarg (metadata_filename): should not emit a deprecation warning
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub")
# Should be "test.epub"
elements[0].metadata.filename

# Deprecated kwarg (file_filename): should log a deprecation warning
with open(filename, "rb") as f:
    elements = partition(file=f, file_filename="test.epub")
# Should still be "test.epub"
elements[0].metadata.filename

# Passing both kwargs at once: should raise a ValueError
with open(filename, "rb") as f:
    elements = partition(file=f, metadata_filename="test.epub", file_filename="test.epub")
```
  • Loading branch information
MthwRobinson authored Aug 24, 2023
1 parent 835378a commit cdae53c
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 54 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
* Bump unstructured-inference
* Fix issue where temporary files were being left behind (0.5.16)
* Add deprecation warning for the `file_filename` kwarg to `partition` and `partition_via_api`,
and for the `file_filenames` kwarg to `partition_multiple_via_api`.
* Fix documentation build workflow by pinning dependencies

## 0.10.5
Expand Down
4 changes: 2 additions & 2 deletions docs/source/bricks/partition.rst
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ Examples:
with ExitStack() as stack:
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
documents = partition_multiple_via_api(files=files, file_filenames=filenames)
documents = partition_multiple_via_api(files=files, metadata_filenames=filenames)
For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/partition/api.py>`_.

Expand Down Expand Up @@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer
elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822")
with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY")
You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the
Expand Down
77 changes: 74 additions & 3 deletions test_unstructured/partition/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

with open(filename, "rb") as f:
elements = partition_via_api(file=f, file_filename=filename)
elements = partition_via_api(file=f, metadata_filename=filename)
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0].metadata.filetype == "message/rfc822"


def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
    """Passing the legacy ``file_filename`` kwarg logs a deprecation warning."""
    # Replace requests.post so the test never issues a real HTTP request.
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockResponse(status_code=200),
    )
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(filename, "rb") as f:
        partition_via_api(file=f, file_filename=filename)

    # The warning is checked through the captured log text.
    assert "WARNING" in caplog.text
    assert "The file_filename kwarg will be deprecated" in caplog.text


def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
    """Supplying both ``file_filename`` and ``metadata_filename`` raises ValueError."""
    # Replace requests.post so the test never issues a real HTTP request.
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockResponse(status_code=200),
    )
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(filename, "rb") as f, pytest.raises(ValueError):
        partition_via_api(file=f, file_filename=filename, metadata_filename=filename)


def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
monkeypatch.setattr(
requests,
Expand Down Expand Up @@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch):
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
elements = partition_multiple_via_api(
files=files,
file_filenames=filenames,
metadata_filenames=filenames,
)
assert len(elements) == 2
assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")
assert elements[0][0].metadata.filetype == "message/rfc822"


def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
    """Passing the legacy ``file_filenames`` kwarg logs a deprecation warning."""
    # Replace requests.post so the test never issues a real HTTP request.
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockMultipleResponse(status_code=200),
    )

    filenames = [
        os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
        os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
    ]

    # ExitStack closes every opened file handle when the block exits.
    with contextlib.ExitStack() as stack:
        files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
        partition_multiple_via_api(
            files=files,
            file_filenames=filenames,
        )
    assert "WARNING" in caplog.text
    assert "The file_filenames kwarg will be deprecated" in caplog.text


def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
    """Supplying both ``file_filenames`` and ``metadata_filenames`` raises ValueError.

    NOTE: despite the ``warns`` in the test name, the assertion here is that a
    ``ValueError`` is raised, not that a warning is logged.
    """
    # Replace requests.post so the test never issues a real HTTP request.
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockMultipleResponse(status_code=200),
    )

    filenames = [
        os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
        os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
    ]

    # ExitStack closes every opened file handle when the block exits.
    with contextlib.ExitStack() as stack:
        files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=files,
                metadata_filenames=filenames,
                file_filenames=filenames,
            )


def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
monkeypatch.setattr(
requests,
Expand Down Expand Up @@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp
with pytest.raises(ValueError):
partition_multiple_via_api(
files=files,
file_filenames=filenames,
metadata_filenames=filenames,
content_types=["text/plain"],
)

Expand Down
82 changes: 49 additions & 33 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
mock_docx_document,
expected_docx_elements,
tmpdir,
pass_file_filename,
pass_metadata_filename,
content_type,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_docx_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
file_filename = doc_filename if pass_file_filename else None
metadata_filename = doc_filename if pass_metadata_filename else None
elements = partition(
filename=doc_filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
Expand All @@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
def test_auto_partition_html_from_filename(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
Expand All @@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type):


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_file_filename, content_type):
def test_auto_partition_html_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
with open(filename) as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
Expand Down Expand Up @@ -285,16 +285,16 @@ def test_auto_partition_text_from_file():


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request):
def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None

elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
Expand Down Expand Up @@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):

mock_partition.assert_called_once_with(
filename=filename,
metadata_filename=None,
file=None,
url=None,
include_page_breaks=False,
Expand All @@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request):
def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None

with open(filename, "rb") as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="hi_res",
)
Expand All @@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning():


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
Expand All @@ -399,32 +400,32 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_file_filename, content_type):
def test_auto_partition_jpg(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
elements = partition(
filename=filename,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
assert len(elements) > 0


@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
file_filename = filename if pass_file_filename else None
metadata_filename = filename if pass_metadata_filename else None
with open(filename, "rb") as f:
elements = partition(
file=f,
file_filename=file_filename,
metadata_filename=metadata_filename,
content_type=content_type,
strategy="auto",
)
Expand Down Expand Up @@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"):
assert elements[0].metadata.filetype == "text/x-rst"


def test_auto_partition_metadata_file_filename():
def test_auto_partition_metadata_filename():
    """``metadata_filename`` sets the filename recorded in element metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        elements = partition(file=f, metadata_filename=filename)
    # Only the final path component is expected in the metadata.
    assert elements[0].metadata.filename == os.path.split(filename)[-1]


def test_auto_partition_warns_about_file_filename_deprecation(caplog):
    """The legacy ``file_filename`` kwarg still works but logs a deprecation warning."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        elements = partition(file=f, file_filename=filename)
    # The deprecated kwarg must still populate the metadata filename.
    assert elements[0].metadata.filename == os.path.split(filename)[-1]
    assert "WARNING" in caplog.text
    assert "The file_filename kwarg will be deprecated" in caplog.text


def test_auto_partition_raises_with_file_and_metadata_filename():
    """Supplying both ``file_filename`` and ``metadata_filename`` raises ValueError."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f, pytest.raises(ValueError):
        partition(file=f, file_filename=filename, metadata_filename=filename)


def test_get_partition_with_extras_prompts_for_install_if_missing():
Expand Down
Loading

0 comments on commit cdae53c

Please sign in to comment.