diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c99582fb6..29b5645fed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ * Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`. * Bump unstructured-inference * Fix issue where temporary files were being left behind (0.5.16) +* Adds deprecation warning for the `file_filename` kwarg to `partition`, `partition_via_api`, + and `partition_multiple_via_api`. * Fix documentation build workflow by pinning dependencies ## 0.10.5 diff --git a/docs/source/bricks/partition.rst b/docs/source/bricks/partition.rst index d074d1e3bb..68177898ba 100644 --- a/docs/source/bricks/partition.rst +++ b/docs/source/bricks/partition.rst @@ -496,7 +496,7 @@ Examples: with ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] - documents = partition_multiple_via_api(files=files, file_filenames=filenames) + documents = partition_multiple_via_api(files=files, metadata_filenames=filenames) For more information about the ``partition_multiple_via_api`` brick, you can check the `source code here `_. @@ -794,7 +794,7 @@ type for the file. If you do not explicitly pass it, the MIME type will be infer elements = partition_via_api(filename=filename, api_key="MY_API_KEY", content_type="message/rfc822") with open(filename, "rb") as f: - elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY") + elements = partition_via_api(file=f, metadata_filename=filename, api_key="MY_API_KEY") You can pass additional settings such as ``strategy``, ``ocr_languages`` and ``encoding`` to the diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index d74589f2fb..e4b128483d 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -67,11 +67,38 @@ def test_partition_via_api_from_file(monkeypatch): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) with open(filename, "rb") as f: - elements = partition_via_api(file=f, file_filename=filename) + elements = partition_via_api(file=f, metadata_filename=filename) assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" +def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog): + monkeypatch.setattr( + requests, + "post", + lambda *args, **kwargs: MockResponse(status_code=200), + ) + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) + + with open(filename, "rb") as f: + partition_via_api(file=f, file_filename=filename) + + assert "WARNING" in caplog.text + assert "The file_filename kwarg will be deprecated" in caplog.text + + +def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch): + monkeypatch.setattr( + requests, + "post", + lambda *args, **kwargs: MockResponse(status_code=200), + ) + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) + + with open(filename, "rb") as f, pytest.raises(ValueError): + partition_via_api(file=f, file_filename=filename, metadata_filename=filename) + + def test_partition_via_api_from_file_raises_without_filename(monkeypatch): monkeypatch.setattr( requests, @@ -246,13 +273,57 @@ def test_partition_multiple_via_api_from_files(monkeypatch): files = [stack.enter_context(open(filename, "rb")) for filename in filenames] elements = partition_multiple_via_api( files=files, - file_filenames=filenames, + metadata_filenames=filenames, ) assert len(elements) == 2 assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0].metadata.filetype == "message/rfc822" +def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog): + monkeypatch.setattr( + requests, + "post", + lambda *args, **kwargs: MockMultipleResponse(status_code=200), + ) + + filenames = [ + os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), + ] + + with contextlib.ExitStack() as stack: + files = [stack.enter_context(open(filename, "rb")) for filename in filenames] + partition_multiple_via_api( + files=files, + file_filenames=filenames, + ) + assert "WARNING" in caplog.text + assert "The file_filenames kwarg will be deprecated" in caplog.text + + +def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch): + monkeypatch.setattr( + requests, + "post", + lambda *args, **kwargs: MockMultipleResponse(status_code=200), + ) + + filenames = [ + os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), + ] + + with contextlib.ExitStack() as stack: + files = [stack.enter_context(open(filename, "rb")) for filename in filenames] + with pytest.raises(ValueError): + partition_multiple_via_api( + files=files, + metadata_filenames=filenames, + file_filenames=filenames, + ) + + def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch): monkeypatch.setattr( requests, @@ -305,7 +376,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp with pytest.raises(ValueError): partition_multiple_via_api( files=files, - file_filenames=filenames, + metadata_filenames=filenames, content_types=["text/plain"], ) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index efe846624d..4fb6309f0f 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -118,24 +118,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)], ) def test_auto_partition_doc_with_filename( mock_docx_document, expected_docx_elements, tmpdir, - pass_file_filename, + pass_metadata_filename, content_type, ): docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") mock_docx_document.save(docx_filename) convert_office_doc(docx_filename, tmpdir.dirname, "doc") - file_filename = doc_filename if pass_file_filename else None + metadata_filename = doc_filename if pass_metadata_filename else None elements = partition( filename=doc_filename, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="hi_res", ) @@ -159,15 +159,15 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) -def test_auto_partition_html_from_filename(pass_file_filename, content_type): +def test_auto_partition_html_from_filename(pass_metadata_filename, content_type): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None elements = partition( filename=filename, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="hi_res", ) @@ -177,16 +177,16 @@ def test_auto_partition_html_from_filename(pass_file_filename, content_type): @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) -def test_auto_partition_html_from_file(pass_file_filename, content_type): +def test_auto_partition_html_from_file(pass_metadata_filename, content_type): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None with open(filename) as f: elements = partition( file=f, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="hi_res", ) @@ -285,16 +285,16 @@ def test_auto_partition_text_from_file(): @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) -def test_auto_partition_pdf_from_filename(pass_file_filename, content_type, request): +def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type, request): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None elements = partition( filename=filename, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="hi_res", ) @@ -332,6 +332,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch): mock_partition.assert_called_once_with( filename=filename, + metadata_filename=None, file=None, url=None, include_page_breaks=False, @@ -342,17 +343,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch): @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) -def test_auto_partition_pdf_from_file(pass_file_filename, content_type, request): +def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, request): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None with open(filename, "rb") as f: elements = partition( file=f, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="hi_res", ) @@ -379,15 +380,15 @@ def test_partition_pdf_doesnt_raise_warning(): @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) -def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type): +def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, content_type): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None elements = partition( filename=filename, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="auto", ) @@ -399,15 +400,15 @@ def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, conten @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) -def test_auto_partition_jpg(pass_file_filename, content_type): +def test_auto_partition_jpg(pass_metadata_filename, content_type): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None elements = partition( filename=filename, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="auto", ) @@ -415,16 +416,16 @@ def test_auto_partition_jpg(pass_file_filename, content_type): @pytest.mark.parametrize( - ("pass_file_filename", "content_type"), + ("pass_metadata_filename", "content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) -def test_auto_partition_jpg_from_file(pass_file_filename, content_type): +def test_auto_partition_jpg_from_file(pass_metadata_filename, content_type): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - file_filename = filename if pass_file_filename else None + metadata_filename = filename if pass_metadata_filename else None with open(filename, "rb") as f: elements = partition( file=f, - file_filename=file_filename, + metadata_filename=metadata_filename, content_type=content_type, strategy="auto", ) @@ -874,11 +875,26 @@ def test_auto_partition_rst_from_file(filename="example-docs/README.rst"): assert elements[0].metadata.filetype == "text/x-rst" -def test_auto_partition_metadata_file_filename(): +def test_auto_partition_metadata_filename(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") + with open(filename) as f: + elements = partition(file=f, metadata_filename=filename) + assert elements[0].metadata.filename == os.path.split(filename)[-1] + + +def test_auto_partition_warns_about_file_filename_deprecation(caplog): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") with open(filename) as f: elements = partition(file=f, file_filename=filename) assert elements[0].metadata.filename == os.path.split(filename)[-1] + assert "WARNING" in caplog.text + assert "The file_filename kwarg will be deprecated" in caplog.text + + +def test_auto_partition_raises_with_file_and_metadata_filename(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") + with open(filename) as f, pytest.raises(ValueError): + partition(file=f, file_filename=filename, metadata_filename=filename) def test_get_partition_with_extras_prompts_for_install_if_missing(): diff --git a/unstructured/partition/api.py b/unstructured/partition/api.py index 1064c06214..a4f7b91521 100644 --- a/unstructured/partition/api.py +++ b/unstructured/partition/api.py @@ -8,6 +8,7 @@ import requests from unstructured.documents.elements import Element +from unstructured.logger import logger from unstructured.partition.common import exactly_one from unstructured.staging.base import dict_to_elements, elements_from_json @@ -19,6 +20,7 @@ def partition_via_api( file_filename: Optional[str] = None, api_url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", + metadata_filename: Optional[str] = None, **request_kwargs, ) -> List[Element]: """Partitions a document using the Unstructured REST API. This is equivalent to @@ -36,7 +38,7 @@ def partition_via_api( A string defining the file content in MIME type file A file-like object using "rb" mode --> open(filename, "rb"). - file_filename + metadata_filename When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" api_url The URL for the Unstructured API. Defaults to the hosted Unstructured API. @@ -48,6 +50,19 @@ def partition_via_api( """ exactly_one(filename=filename, file=file) + if metadata_filename and file_filename: + raise ValueError( + "Only one of metadata_filename and file_filename is specified. " + "metadata_filename is preferred. file_filename is marked for deprecation.", + ) + + if file_filename is not None: + metadata_filename = file_filename + logger.warn( + "The file_filename kwarg will be deprecated in a future version of unstructured. " + "Please use metadata_filename instead.", + ) + headers = { "ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": api_key, @@ -65,13 +80,13 @@ def partition_via_api( files=files, # type: ignore ) elif file is not None: - if file_filename is None: + if metadata_filename is None: raise ValueError( "If file is specified in partition_via_api, " - "file_filename must be specified as well.", + "metadata_filename must be specified as well.", ) files = [ - ("files", (file_filename, file, content_type)), # type: ignore + ("files", (metadata_filename, file, content_type)), # type: ignore ] response = requests.post( api_url, @@ -95,6 +110,7 @@ def partition_multiple_via_api( file_filenames: Optional[List[str]] = None, api_url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", + metadata_filenames: Optional[List[str]] = None, **request_kwargs, ) -> List[List[Element]]: """Partitions multiple document using the Unstructured REST API by batching @@ -112,7 +128,7 @@ def partition_multiple_via_api( A list of strings defining the file contents in MIME types. files A list of file-like object using "rb" mode --> open(filename, "rb"). - file_filename + metadata_filename When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" api_url The URL for the Unstructured API. Defaults to the hosted Unstructured API. @@ -127,6 +143,19 @@ def partition_multiple_via_api( "UNSTRUCTURED-API-KEY": api_key, } + if metadata_filenames and file_filenames: + raise ValueError( + "Only one of metadata_filenames and file_filenames is specified. " + "metadata_filenames is preferred. file_filenames is marked for deprecation.", + ) + + if file_filenames is not None: + metadata_filenames = file_filenames + logger.warn( + "The file_filenames kwarg will be deprecated in a future version of unstructured. " + "Please use metadata_filenames instead.", + ) + if filenames is not None: if content_types and len(content_types) != len(filenames): raise ValueError("content_types and filenames must have the same length.") @@ -151,15 +180,15 @@ def partition_multiple_via_api( if content_types and len(content_types) != len(files): raise ValueError("content_types and files must have the same length.") - if not file_filenames: - raise ValueError("file_filenames must be specified if files are passed") - elif len(file_filenames) != len(files): - raise ValueError("file_filenames and files must have the same length.") + if not metadata_filenames: + raise ValueError("metadata_filenames must be specified if files are passed") + elif len(metadata_filenames) != len(files): + raise ValueError("metadata_filenames and files must have the same length.") _files = [] for i, _file in enumerate(files): # type: ignore content_type = content_types[i] if content_types is not None else None - filename = file_filenames[i] + filename = metadata_filenames[i] _files.append(("files", (filename, _file, content_type))) response = requests.post( diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index d04f78da0d..838ec5eacf 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -132,6 +132,7 @@ def partition( pdf_infer_table_structure: bool = False, xml_keep_tags: bool = False, data_source_metadata: Optional[DataSourceMetadata] = None, + metadata_filename: Optional[str] = None, **kwargs, ): """Partitions a document into its constituent elements. Will use libmagic to determine @@ -147,7 +148,7 @@ def partition( A string defining the file content in MIME type file A file-like object using "rb" mode --> open(filename, "rb"). - file_filename + metadata_filename When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt" url The url for a remote document. Pass in content_type if you want partition to treat @@ -181,6 +182,20 @@ def partition( """ exactly_one(file=file, filename=filename, url=url) + if metadata_filename and file_filename: + raise ValueError( + "Only one of metadata_filename and file_filename is specified. " + "metadata_filename is preferred. file_filename is marked for deprecation.", + ) + + if file_filename is not None: + metadata_filename = file_filename + logger.warn( + "The file_filename kwarg will be deprecated in a future version of unstructured. " + "Please use metadata_filename instead.", + ) + kwargs.setdefault("metadata_filename", metadata_filename) + if url is not None: file, filetype = file_and_type_from_url( url=url, @@ -197,7 +212,7 @@ def partition( filetype = detect_filetype( filename=filename, file=file, - file_filename=file_filename, + file_filename=metadata_filename, content_type=content_type, encoding=encoding, ) @@ -211,9 +226,6 @@ def partition( pdf_infer_table_structure, ) - if file is not None and file_filename is not None: - kwargs.setdefault("metadata_filename", file_filename) - if filetype == FileType.DOC: _partition_doc = _get_partition_with_extras("doc") elements = _partition_doc(filename=filename, file=file, **kwargs) diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index 9a108bbc8f..5dd4d6afdd 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -1,4 +1,5 @@ import tempfile +import warnings from typing import IO, List, Optional from ebooklib import epub @@ -52,7 +53,11 @@ def partition_epub( filename = tmp.name last_modification_date = get_last_modified_date_from_file(file) - book = epub.read_epub(filename, options={"ignore_ncx": False}) + # NOTE(robinson): ignore ebooklib warning about changing the ignore_ncx default + # in the future. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + book = epub.read_epub(filename, options={"ignore_ncx": False}) # book.items also includes EpubLink, EpubImage, EpubNcx (page navigation info) # and EpubItem (fomatting/css) html_items = [item for item in book.items if isinstance(item, epub.EpubHtml)]