diff --git a/CHANGELOG.md b/CHANGELOG.md index d95c0a8fe2..b1dab9fdc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixes +* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`. + ## 0.10.5 ### Enhancements diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 49a64ef2e7..60bceead19 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -325,10 +325,14 @@ def mock_exists(dep): pdf, "extractable_elements", return_value=mock_return, - ) as mock_partition: + ) as mock_partition, mock.patch.object( + pdf, + "_partition_pdf_or_image_with_ocr", + ) as mock_partition_ocr: pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only") mock_partition.assert_called_once() + mock_partition_ocr.assert_not_called() assert "pytesseract is not installed" in caplog.text diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0199b98e86..588ca11078 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -155,7 +155,18 @@ def partition_pdf_or_image( file=file, filename=filename, ) - if not is_image: + + if ( + not is_image + and determine_pdf_or_image_strategy( + strategy, + filename=filename, + file=file, + is_image=is_image, + infer_table_structure=infer_table_structure, + ) + != "ocr_only" + ): extracted_elements = extractable_elements( filename=filename, file=spooled_to_bytes_io_if_needed(file), @@ -209,6 +220,7 @@ def partition_pdf_or_image( min_partition=min_partition, metadata_last_modified=metadata_last_modified or last_modification_date, ) + return layout_elements