Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/1209 tweak xycut ordering output #1630

Merged
merged 15 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

### Fixes

* **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order, which benefits downstream applications. While element ordering from `xy-cut` is mostly correct for multi-column documents, elements from a right-hand-side (RHS) column sometimes appear before elements in a left-hand-side (LHS) column. Fix: add a swapped `xy-cut` ordering that sorts by X coordinate first and then by Y coordinate.

## 0.10.19

### Enhancements
Expand Down
29 changes: 18 additions & 11 deletions examples/custom-layout-order/evaluate_xy_cut_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def draw_elements(elements, images, output_type, output_dir, base_name, label):
def run_partition_pdf(
pdf_path,
strategy,
scope,
images,
output_type="plot",
output_root_dir="",
Expand All @@ -113,13 +114,14 @@ def run_partition_pdf(
output_dir = os.path.join(output_root_dir, strategy, f_base_name)
os.makedirs(output_dir, exist_ok=True)

original_elements = partition_pdf(
filename=pdf_path,
strategy=strategy,
include_page_breaks=True,
sort_mode=SORT_MODE_BASIC,
)
draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")
if scope == "all":
original_elements = partition_pdf(
filename=pdf_path,
strategy=strategy,
include_page_breaks=True,
sort_mode=SORT_MODE_BASIC,
)
draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")

ordered_elements = partition_pdf(
filename=pdf_path,
Expand All @@ -134,22 +136,27 @@ def run_partition_pdf(
def run():
f_sub_path = sys.argv[1]
strategy = sys.argv[2]
scope = sys.argv[3]

base_dir = os.getcwd()
output_root_dir = os.path.join(base_dir, "examples", "custom-layout-order", "output")
os.makedirs(output_root_dir, exist_ok=True)

f_path = os.path.join(base_dir, f_sub_path)
images = pdf2image.convert_from_path(f_path)
run_partition_pdf(f_path, strategy, images, "image", output_root_dir)
run_partition_pdf(f_path, strategy, scope, images, "image", output_root_dir)


if __name__ == '__main__':
if len(sys.argv) < 3:
if len(sys.argv) < 4:
print(
"Please provide the path to the file name as the first argument and the strategy as the "
"second argument.",
"Please provide the path to the file name as the first argument, the strategy as the "
"second argument and the scope as the third argument.",
)
sys.exit(1)

if sys.argv[3] not in ["all", "xycut_only"]:
print("Invalid scope")
sys.exit(1)

run()
16 changes: 9 additions & 7 deletions test_unstructured/partition/pdf-image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def test_partition_image_with_auto_strategy(
elements = image.partition_image(filename=filename, strategy="auto")
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert titles[0].text == title
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_image_with_table_extraction(
Expand Down Expand Up @@ -240,11 +241,12 @@ def test_partition_image_default_strategy_hi_res():
with open(filename, "rb") as f:
elements = image.partition_image(file=f)

first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == first_line
assert elements[0].metadata.coordinates is not None
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert elements[idx].text == title
assert elements[idx].metadata.coordinates is not None
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_image_metadata_date(
Expand Down
19 changes: 9 additions & 10 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ def test_partition_pdf_with_auto_strategy(
):
elements = pdf.partition_pdf(filename=filename, strategy="auto")
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == title
assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[0].metadata.file_directory == "example-docs"
assert elements[7].text == title
assert elements[7].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[7].metadata.file_directory == "example-docs"


def test_partition_pdf_with_page_breaks(
Expand Down Expand Up @@ -388,13 +388,12 @@ def test_partition_pdf_uses_table_extraction():
def test_partition_pdf_with_copy_protection():
filename = os.path.join("example-docs", "copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
elements[0] == Title(
"LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis",
)
# check that the pdf has multiple different page numbers
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 3
assert elements[idx].text == title
assert {element.metadata.page_number for element in elements} == {1, 2}
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
assert elements[idx].metadata.detection_class_prob is not None
assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_pdf_with_dpi():
Expand Down Expand Up @@ -518,7 +517,7 @@ def test_partition_pdf_with_auto_strategy_exclude_metadata(
include_metadata=False,
)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == title
assert elements[7].text == title
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}

Expand Down
31 changes: 18 additions & 13 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,17 +302,19 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type,
strategy="hi_res",
)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
idx = 3
assert isinstance(elements[idx], Title)
assert elements[idx].text.startswith("LayoutParser")

assert elements[0].metadata.filename == os.path.basename(filename)
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
assert elements[idx].metadata.filename == os.path.basename(filename)
assert elements[idx].metadata.file_directory == os.path.split(filename)[0]

# NOTE(alan): Xfail since new model skips the word Zejiang
request.applymarker(pytest.mark.xfail)

assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen")
idx += 1
assert isinstance(elements[idx], NarrativeText)
assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_pdf_uses_table_extraction():
Expand Down Expand Up @@ -361,14 +363,16 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
strategy="hi_res",
)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
idx = 3
assert isinstance(elements[idx], Title)
assert elements[idx].text.startswith("LayoutParser")

# NOTE(alan): Xfail since new model misses the first word Zejiang
request.applymarker(pytest.mark.xfail)

assert isinstance(elements[1], NarrativeText)
assert elements[1].text.startswith("Zejiang Shen")
idx += 1
assert isinstance(elements[idx], NarrativeText)
assert elements[idx].text.startswith("Zejiang Shen")


def test_auto_partition_formats_languages_for_tesseract():
Expand Down Expand Up @@ -425,9 +429,10 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
)

# should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[0].text == first_line
assert elements[0].metadata.coordinates is not None
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
assert elements[idx].text == title
assert elements[idx].metadata.coordinates is not None


@pytest.mark.parametrize(
Expand Down
60 changes: 60 additions & 0 deletions test_unstructured/partition/utils/test_xycut.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np
import pytest

from unstructured.partition.utils.xycut import (
projection_by_bboxes,
recursive_xy_cut,
recursive_xy_cut_swapped,
split_projection_profile,
)


def test_projection_by_bboxes():
    """Check the first 30 entries of the axis-0 and axis-1 projections of two boxes."""
    bboxes = np.array([[10, 20, 50, 60], [30, 40, 70, 80]])

    # Axis 0 (horizontal): ten leading zeros, then twenty ones.
    horizontal = projection_by_bboxes(bboxes, 0)
    horizontal_head = np.array([0] * 10 + [1] * 20)
    assert np.array_equal(horizontal[:30], horizontal_head)

    # Axis 1 (vertical): twenty leading zeros, then ten ones.
    vertical = projection_by_bboxes(bboxes, 1)
    vertical_head = np.array([0] * 20 + [1] * 10)
    assert np.array_equal(vertical[:30], vertical_head)


def test_split_projection_profile():
    """Run split_projection_profile on two sample profiles and compare the returned pair."""
    # Each case: (profile values, min_value, min_gap, first expected array, second expected array).
    # NOTE(review): the expected pairs look like segment start indices and exclusive end
    # indices -- confirm against split_projection_profile itself.
    cases = [
        (
            [0, 0, 3, 4, 0, 0, 2, 0, 0, 0, 5, 6, 7, 0, 0, 0],
            0,
            1,
            [2, 6, 10],
            [4, 7, 13],
        ),
        (
            [0, 2, 0, 0, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0],
            1,
            2,
            [1, 5, 8],
            [2, 6, 11],
        ),
    ]
    for profile, min_value, min_gap, first, second in cases:
        actual = split_projection_profile(np.array(profile), min_value, min_gap)
        wanted = (np.array(first), np.array(second))
        assert np.array_equal(actual, wanted)


@pytest.mark.parametrize(
    ("recursive_func", "expected"),
    [
        (recursive_xy_cut, [0, 1, 2]),
        (recursive_xy_cut_swapped, [0, 2, 1]),
    ],
)
def test_recursive_xy_cut(recursive_func, expected):
    """Check the index order each xy-cut variant writes into the output list."""
    # Three boxes: two whose first two coordinates are (0, 0) and (0, 40), and one at (200, 0).
    layout = np.array(
        [
            [0, 0, 20, 20],
            [200, 0, 230, 30],
            [0, 40, 50, 50],
        ],
    )
    order = []
    # The function under test fills `order` in place; its return value is not used.
    recursive_func(layout, np.arange(3), order)
    assert order == expected
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
},
{
"type": "Footer",
"type": "UncategorizedText",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
Expand All @@ -456,7 +456,7 @@
"text": "2"
},
{
"type": "UncategorizedText",
"type": "Footer",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {
Expand Down
Loading
Loading