Skip to content

Commit

Permalink
revert pdf changes and add new pdf for empty page testing (#1255)
Browse files Browse the repository at this point in the history
- revert the layout-parser fast PDF file to the original with just two pages
- add a new file that has one empty page and one page that says "this page is
intentionally left blank" for tests
  • Loading branch information
badGarnet authored Sep 1, 2023
1 parent fc9d251 commit 1a0b737
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 4 deletions.
Binary file modified example-docs/layout-parser-paper-fast.pdf
Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_partition_pdf(
file_mode,
strategy,
expected,
filename="example-docs/layout-parser-paper-fast.pdf",
filename="example-docs/layout-parser-paper-with-empty-pages.pdf",
):
# Test that the partition_pdf function can handle filename
def _test(result):
Expand Down Expand Up @@ -219,7 +219,7 @@ def test_partition_pdf_with_fast_strategy(
elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
assert len(elements) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in elements} == {1, 4}
assert {element.metadata.page_number for element in elements} == {1, 2}
for element in elements:
assert element.metadata.filename == "layout-parser-paper-fast.pdf"

Expand Down
5 changes: 3 additions & 2 deletions test_unstructured_ingest/test-ingest-against-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \
--structured-output-dir "$OUTPUT_DIR" \
--verbose \
--file-glob "*.pdf" \
--num-processes 1 \
--file-glob "*1p.txt" \
--input-path example-docs

sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME

0 comments on commit 1a0b737

Please sign in to comment.