-
Notifications
You must be signed in to change notification settings - Fork 743
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix/1209 tweak xycut ordering output (#1630)
Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
- [sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
- [multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
- [11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)

### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```

### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```
- Loading branch information
1 parent
6d8572d
commit b30d6a6
Showing
24 changed files
with
1,864 additions
and
1,715 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from unstructured.partition.utils.xycut import ( | ||
projection_by_bboxes, | ||
recursive_xy_cut, | ||
recursive_xy_cut_swapped, | ||
split_projection_profile, | ||
) | ||
|
||
|
||
def test_projection_by_bboxes():
    """Check the first 30 bins of the projection profile along each axis.

    Two boxes are projected with ``projection_by_bboxes``. For axis 0
    (horizontal projection) the profile is expected to be 0 for the first
    10 bins and 1 for the next 20; for axis 1 (vertical projection) it is
    expected to be 0 for the first 20 bins and 1 for the next 10. Only the
    leading 30 bins of each profile are compared.
    """
    bboxes = np.array([[10, 20, 50, 60], [30, 40, 70, 80]])
    window = 30  # number of leading profile bins that are compared

    # (axis, count of leading zero bins before the profile becomes 1)
    cases = (
        (0, 10),  # horizontal projection
        (1, 20),  # vertical projection
    )
    for axis, n_empty in cases:
        profile = projection_by_bboxes(bboxes, axis)
        expected_head = np.array([0] * n_empty + [1] * (window - n_empty))
        assert np.array_equal(profile[:window], expected_head)
|
||
|
||
def test_split_projection_profile():
    """Check ``split_projection_profile`` on two hand-built profiles.

    In both scenarios the expected pair of arrays holds, for every run of
    profile values strictly above ``min_value``, the index where the run
    begins (first array) and the index one past where it ends (second
    array).
    """
    # Each entry: (profile, min_value, min_gap, expected (starts, ends)).
    scenarios = [
        (
            np.array([0, 0, 3, 4, 0, 0, 2, 0, 0, 0, 5, 6, 7, 0, 0, 0]),
            0,
            1,
            (np.array([2, 6, 10]), np.array([4, 7, 13])),
        ),
        (
            np.array([0, 2, 0, 0, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0]),
            1,
            2,
            (np.array([1, 5, 8]), np.array([2, 6, 11])),
        ),
    ]
    for profile, min_value, min_gap, expected_segments in scenarios:
        segments = split_projection_profile(profile, min_value, min_gap)
        assert np.array_equal(segments, expected_segments)
|
||
|
||
@pytest.mark.parametrize(
    ("recursive_func", "expected"),
    [
        (recursive_xy_cut, [0, 1, 2]),
        (recursive_xy_cut_swapped, [0, 2, 1]),
    ],
)
def test_recursive_xy_cut(recursive_func, expected):
    """Check the ordering each XY-cut variant appends for three boxes.

    The cut function receives the boxes, their indices and an empty list,
    and is expected to fill that list in place: ``recursive_xy_cut`` with
    ``[0, 1, 2]`` and ``recursive_xy_cut_swapped`` with ``[0, 2, 1]``.
    """
    bboxes = np.array(
        [
            [0, 0, 20, 20],
            [200, 0, 230, 30],
            [0, 40, 50, 50],
        ],
    )
    box_ids = np.arange(len(bboxes))

    # The function under test reports its result by appending to this list.
    ordering = []
    recursive_func(bboxes, box_ids, ordering)

    assert ordering == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.