From 1ddf542e1432b7d82b1748ec4b49992f3ab28022 Mon Sep 17 00:00:00 2001 From: Charles Date: Wed, 23 Aug 2023 03:43:33 +0100 Subject: [PATCH] fix: Don't call extractable_elements if strategy is ocr_only (#1160) - fixes #1079, where partitioning happened twice when `strategy="ocr_only"` - only calls `extractable_elements` when the strategy resolved by `determine_pdf_or_image_strategy` is not `ocr_only` (for example, when `ocr_only` was requested but had to fall back to another strategy) - adds a test assertion that `_partition_pdf_or_image_with_ocr` is not called when falling back to `fast` from `ocr_only` --- CHANGELOG.md | 2 ++ test_unstructured/partition/pdf-image/test_pdf.py | 6 +++++- unstructured/partition/pdf.py | 14 +++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d95c0a8fe2..b1dab9fdc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixes +* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`. + ## 0.10.5 ### Enhancements diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 49a64ef2e7..60bceead19 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -325,10 +325,14 @@ def mock_exists(dep): pdf, "extractable_elements", return_value=mock_return, - ) as mock_partition: + ) as mock_partition, mock.patch.object( + pdf, + "_partition_pdf_or_image_with_ocr", + ) as mock_partition_ocr: pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only") mock_partition.assert_called_once() + mock_partition_ocr.assert_not_called() assert "pytesseract is not installed" in caplog.text diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0199b98e86..588ca11078 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -155,7 +155,18 @@ def partition_pdf_or_image( file=file, filename=filename, ) - if not 
is_image: + + if ( + not is_image + and determine_pdf_or_image_strategy( + strategy, + filename=filename, + file=file, + is_image=is_image, + infer_table_structure=infer_table_structure, + ) + != "ocr_only" + ): extracted_elements = extractable_elements( filename=filename, file=spooled_to_bytes_io_if_needed(file), @@ -209,6 +220,7 @@ def partition_pdf_or_image( min_partition=min_partition, metadata_last_modified=metadata_last_modified or last_modification_date, ) + return layout_elements