Skip to content

Commit

Permalink
Fix/1209 tweak xycut ordering output (#1630)
Browse files Browse the repository at this point in the history
Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
- [sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
- [multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
- [11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)
### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```
### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```
  • Loading branch information
christinestraub authored Oct 5, 2023
1 parent 6d8572d commit b30d6a6
Show file tree
Hide file tree
Showing 24 changed files with 1,864 additions and 1,715 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

### Fixes

* **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order, which benefits downstream applications. While element ordering from `xy-cut` is mostly correct for multi-column documents, elements from a right-hand-side (RHS) column sometimes appear before elements in a left-hand-side (LHS) column. Fix: add a swapped `xy-cut` ordering that sorts by X coordinate first and then by Y coordinate.

## 0.10.19

### Enhancements
Expand Down
29 changes: 18 additions & 11 deletions examples/custom-layout-order/evaluate_xy_cut_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def draw_elements(elements, images, output_type, output_dir, base_name, label):
def run_partition_pdf(
pdf_path,
strategy,
scope,
images,
output_type="plot",
output_root_dir="",
Expand All @@ -113,13 +114,14 @@ def run_partition_pdf(
output_dir = os.path.join(output_root_dir, strategy, f_base_name)
os.makedirs(output_dir, exist_ok=True)

original_elements = partition_pdf(
filename=pdf_path,
strategy=strategy,
include_page_breaks=True,
sort_mode=SORT_MODE_BASIC,
)
draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")
if scope == "all":
original_elements = partition_pdf(
filename=pdf_path,
strategy=strategy,
include_page_breaks=True,
sort_mode=SORT_MODE_BASIC,
)
draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")

ordered_elements = partition_pdf(
filename=pdf_path,
Expand All @@ -134,22 +136,27 @@ def run_partition_pdf(
def run():
f_sub_path = sys.argv[1]
strategy = sys.argv[2]
scope = sys.argv[3]

base_dir = os.getcwd()
output_root_dir = os.path.join(base_dir, "examples", "custom-layout-order", "output")
os.makedirs(output_root_dir, exist_ok=True)

f_path = os.path.join(base_dir, f_sub_path)
images = pdf2image.convert_from_path(f_path)
run_partition_pdf(f_path, strategy, images, "image", output_root_dir)
run_partition_pdf(f_path, strategy, scope, images, "image", output_root_dir)


if __name__ == '__main__':
if len(sys.argv) < 3:
if len(sys.argv) < 4:
print(
"Please provide the path to the file name as the first argument and the strategy as the "
"second argument.",
"Please provide the path to the file name as the first argument, the strategy as the "
"second argument and the scope as the third argument.",
)
sys.exit(1)

if sys.argv[3] not in ["all", "xycut_only"]:
print("Invalid scope")
sys.exit(1)

run()
16 changes: 9 additions & 7 deletions test_unstructured/partition/pdf-image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def test_partition_image_with_auto_strategy(
elements = image.partition_image(filename=filename, strategy="auto")
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert titles[0].text == title
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_image_with_table_extraction(
Expand Down Expand Up @@ -240,11 +241,12 @@ def test_partition_image_default_strategy_hi_res():
with open(filename, "rb") as f:
elements = image.partition_image(file=f)

first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == first_line
assert elements[0].metadata.coordinates is not None
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert elements[idx].text == title
assert elements[idx].metadata.coordinates is not None
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_image_metadata_date(
Expand Down
19 changes: 9 additions & 10 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ def test_partition_pdf_with_auto_strategy(
):
elements = pdf.partition_pdf(filename=filename, strategy="auto")
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == title
assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[0].metadata.file_directory == "example-docs"
assert elements[7].text == title
assert elements[7].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[7].metadata.file_directory == "example-docs"


def test_partition_pdf_with_page_breaks(
Expand Down Expand Up @@ -388,13 +388,12 @@ def test_partition_pdf_uses_table_extraction():
def test_partition_pdf_with_copy_protection():
filename = os.path.join("example-docs", "copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
elements[0] == Title(
"LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis",
)
# check that the pdf has multiple different page numbers
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 3
assert elements[idx].text == title
assert {element.metadata.page_number for element in elements} == {1, 2}
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_pdf_with_dpi():
Expand Down Expand Up @@ -518,7 +517,7 @@ def test_partition_pdf_with_auto_strategy_exclude_metadata(
include_metadata=False,
)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == title
assert elements[7].text == title
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}

Expand Down
31 changes: 18 additions & 13 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,17 +302,19 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type,
strategy="hi_res",
)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
idx = 3
assert isinstance(elements[idx], Title)
assert elements[idx].text.startswith("LayoutParser")

assert elements[0].metadata.filename == os.path.basename(filename)
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
assert elements[idx].metadata.filename == os.path.basename(filename)
assert elements[idx].metadata.file_directory == os.path.split(filename)[0]

# NOTE(alan): Xfail since new model skips the word Zejiang
request.applymarker(pytest.mark.xfail)

assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen")
idx += 1
assert isinstance(elements[idx], NarrativeText)
assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
Expand Down Expand Up @@ -361,14 +363,16 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
strategy="hi_res",
)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
idx = 3
assert isinstance(elements[idx], Title)
assert elements[idx].text.startswith("LayoutParser")

# NOTE(alan): Xfail since new model misses the first word Zejiang
request.applymarker(pytest.mark.xfail)

assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen")
idx += 1
assert isinstance(elements[idx], NarrativeText)
assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_formats_languages_for_tesseract():
Expand Down Expand Up @@ -425,9 +429,10 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
)

# should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == first_line
assert elements[0].metadata.coordinates is not None
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert elements[idx].text == title
assert elements[idx].metadata.coordinates is not None


@pytest.mark.parametrize(
Expand Down
60 changes: 60 additions & 0 deletions test_unstructured/partition/utils/test_xycut.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np
import pytest

from unstructured.partition.utils.xycut import (
projection_by_bboxes,
recursive_xy_cut,
recursive_xy_cut_swapped,
split_projection_profile,
)


def test_projection_by_bboxes():
    """Check the first 30 bins of the projection profile along each axis.

    Two overlapping boxes are projected. Only the leading 30 entries of each
    profile are compared, so the test is independent of the profile's total
    length.
    """
    bboxes = np.array([[10, 20, 50, 60], [30, 40, 70, 80]])

    # Horizontal projection (axis 0): empty up to index 10, then covered.
    horizontal_head = np.array([0] * 10 + [1] * 20)
    horizontal_profile = projection_by_bboxes(bboxes, 0)
    assert np.array_equal(horizontal_profile[:30], horizontal_head)

    # Vertical projection (axis 1): empty up to index 20, then covered.
    vertical_head = np.array([0] * 20 + [1] * 10)
    vertical_profile = projection_by_bboxes(bboxes, 1)
    assert np.array_equal(vertical_profile[:30], vertical_head)


def test_split_projection_profile():
    """Check the (starts, ends) segments returned for two sample profiles.

    Each case lists the profile values, ``min_value``, ``min_gap`` and the
    expected ``(starts, ends)`` pair. In the expected data the end indices
    are exclusive: a run occupying indices 2..3 is reported as (2, 4).
    """
    cases = [
        # Runs above 0 separated by gaps wider than 1.
        (
            [0, 0, 3, 4, 0, 0, 2, 0, 0, 0, 5, 6, 7, 0, 0, 0],
            0,
            1,
            (np.array([2, 6, 10]), np.array([4, 7, 13])),
        ),
        # Different threshold and gap: adjacent indices 8..10 stay one segment.
        (
            [0, 2, 0, 0, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0],
            1,
            2,
            (np.array([1, 5, 8]), np.array([2, 6, 11])),
        ),
    ]

    for profile, min_value, min_gap, expected_segments in cases:
        segments = split_projection_profile(np.array(profile), min_value, min_gap)
        assert np.array_equal(segments, expected_segments)


@pytest.mark.parametrize(
    ("recursive_func", "expected"),
    [
        (recursive_xy_cut, [0, 1, 2]),
        (recursive_xy_cut_swapped, [0, 2, 1]),
    ],
)
def test_recursive_xy_cut(recursive_func, expected):
    """Check the ordering each xy-cut variant produces for a three-box layout.

    The same layout is fed to both variants. The regular cut is expected to
    yield 0, 1, 2, while the swapped cut is expected to place box 2 before
    box 1.
    """
    layout = np.array(
        [
            [0, 0, 20, 20],  # box 0
            [200, 0, 230, 30],  # box 1: far larger first coordinate than boxes 0 and 2
            [0, 40, 50, 50],  # box 2: same first coordinate as box 0, larger second
        ],
    )
    box_ids = np.array([0, 1, 2])

    # The function under test appends the resulting order to this list in place.
    ordering = []
    recursive_func(layout, box_ids, ordering)

    assert ordering == expected
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
},
{
"type": "Footer",
"type": "UncategorizedText",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
Expand All @@ -456,7 +456,7 @@
"text": "2"
},
{
"type": "UncategorizedText",
"type": "Footer",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
Expand Down
Loading

0 comments on commit b30d6a6

Please sign in to comment.