From b39e0d735471f751809c79ae54ea5fd58e3c133b Mon Sep 17 00:00:00 2001 From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:26:06 -0400 Subject: [PATCH] Roman/expose dpi param (#966) * Bump inference version * Pass through the dpi param if available * Update CHANGELOG * Check dpi param passed in via unit test * Bump inference version * Fix unit test around file info to work on mac as well --- CHANGELOG.md | 3 ++- requirements/local-inference.in | 2 +- requirements/local-inference.txt | 2 +- test_unstructured/file_utils/test_exploration.py | 2 +- test_unstructured/partition/test_pdf.py | 14 ++++++++++++++ unstructured/partition/pdf.py | 14 ++++++++++---- 6 files changed, 29 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f50c7bfb4..2e0a3e54ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,13 @@ ### Features * Adds Outlook connector +* Add support for dpi parameter in inference library ### Fixes * Fixes issue with email partitioning where From field was being assigned the To field value. -## 0.8.2-dev7 +## 0.8.2 ### Enhancements diff --git a/requirements/local-inference.in b/requirements/local-inference.in index a7d1bd2153..9ac43c9a72 100644 --- a/requirements/local-inference.in +++ b/requirements/local-inference.in @@ -1,3 +1,3 @@ -c constraints.in -c base.txt -unstructured-inference==0.5.5 +unstructured-inference==0.5.7 diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt index 7ef9ea552b..860c7b2cc7 100644 --- a/requirements/local-inference.txt +++ b/requirements/local-inference.txt @@ -215,7 +215,7 @@ tzdata==2023.3 # via # -c requirements/base.txt # pandas -unstructured-inference==0.5.5 +unstructured-inference==0.5.7 # via -r requirements/local-inference.in urllib3==1.26.16 # via diff --git a/test_unstructured/file_utils/test_exploration.py b/test_unstructured/file_utils/test_exploration.py index 44785992d6..d1e052ae94 100644 --- a/test_unstructured/file_utils/test_exploration.py +++ b/test_unstructured/file_utils/test_exploration.py @@ -39,7 +39,7 @@ def test_get_directory_file_info(tmpdir): assert isinstance(file_info, pd.DataFrame) assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"} - means = file_info.groupby("filetype").mean() + means = file_info.groupby("filetype").mean(numeric_only=True) assert means.columns.to_list() == ["filesize"] diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index b4d7c1a922..dfe42eb6b3 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -384,6 +384,20 @@ def test_partition_pdf_with_copy_protection(): assert {element.metadata.page_number for element in elements} == {1, 2} +def test_partition_pdf_with_dpi(): + filename = os.path.join("example-docs", "copy-protected.pdf") + with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: + pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100) + mock_process.assert_called_once_with( + filename, + is_image=False, + ocr_languages="eng", + extract_tables=False, + model_name=None, + pdf_image_dpi=100, + ) + + def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"): elements = pdf.partition_pdf(filename=filename, strategy="fast") assert len(elements) > 50 diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d416c76b22..0baecc53d1 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -203,12 +203,18 @@ def _partition_pdf_or_image_local( model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME") if file is None: + pdf_image_dpi = kwargs.pop("pdf_image_dpi", None) + process_file_with_model_kwargs = { + "is_image": is_image, + "ocr_languages": ocr_languages, + "extract_tables": infer_table_structure, + "model_name": model_name, + } + if pdf_image_dpi: + process_file_with_model_kwargs["pdf_image_dpi"] = pdf_image_dpi layout = process_file_with_model( filename, - is_image=is_image, - ocr_languages=ocr_languages, - extract_tables=infer_table_structure, - model_name=model_name, + **process_file_with_model_kwargs, ) else: layout = process_data_with_model(