diff --git a/example-docs/layout-parser-paper-fast.pdf b/example-docs/layout-parser-paper-fast.pdf index 98476d4e2d..6a3180631d 100644 Binary files a/example-docs/layout-parser-paper-fast.pdf and b/example-docs/layout-parser-paper-fast.pdf differ diff --git a/example-docs/layout-parser-paper-with-empty-pages.pdf b/example-docs/layout-parser-paper-with-empty-pages.pdf new file mode 100644 index 0000000000..98476d4e2d Binary files /dev/null and b/example-docs/layout-parser-paper-with-empty-pages.pdf differ diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index d2e6633f7d..2e6fbccff7 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -121,7 +121,7 @@ def test_partition_pdf( file_mode, strategy, expected, - filename="example-docs/layout-parser-paper-fast.pdf", + filename="example-docs/layout-parser-paper-with-empty-pages.pdf", ): # Test that the partition_pdf function can handle filename def _test(result): @@ -219,7 +219,7 @@ def test_partition_pdf_with_fast_strategy( elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast") assert len(elements) > 10 # check that the pdf has multiple different page numbers - assert {element.metadata.page_number for element in elements} == {1, 4} + assert {element.metadata.page_number for element in elements} == {1, 2} for element in elements: assert element.metadata.filename == "layout-parser-paper-fast.pdf" diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index 176737dea0..2fd85e329b 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -20,7 +20,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --reprocess \ --structured-output-dir "$OUTPUT_DIR" \ --verbose \ - --file-glob "*.pdf" \ + --num-processes 1 \ + --file-glob "*1p.txt" \ --input-path example-docs -sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME +sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME