Fix/1209 tweak xycut ordering output (#1630)

Closes GH Issue #1209.

### Summary
- add swapped `xycut` sorting
- update `xycut` sorting evaluation script

PDFs:
- [sbaa031.073.pdf](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7234218/pdf/sbaa031.073.pdf)
- [multi-column-2p.pdf](https://github.com/Unstructured-IO/unstructured/files/12796147/multi-column-2p.pdf)
- [11723901.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12360085/11723901.pdf)

### Testing
```
elements = partition_pdf("sbaa031.073.pdf", strategy="hi_res")
print("\n\n".join([str(el) for el in elements]))
```

### Evaluation
```
PYTHONPATH=. python examples/custom-layout-order/evaluate_xy_cut_sorting.py sbaa031.073.pdf hi_res xycut_only
```
Unstructured-IO · Oct 5, 2023 · b30d6a6 · b30d6a6
1 parent 6d8572d
commit b30d6a6
Show file tree

Hide file tree

Showing 24 changed files with 1,864 additions and 1,715 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ### Fixes
 
+* **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order, which benefits downstream applications. While element ordering from `xy-cut` is mostly correct for multi-column documents, elements from a right-hand-side (RHS) column sometimes appear before elements in a left-hand-side (LHS) column. Fix: add a swapped `xy-cut` ordering that sorts by X coordinate first and then by Y coordinate.
+
 ## 0.10.19
 
 ### Enhancements

diff --git a/examples/custom-layout-order/evaluate_xy_cut_sorting.py b/examples/custom-layout-order/evaluate_xy_cut_sorting.py
@@ -103,6 +103,7 @@ def draw_elements(elements, images, output_type, output_dir, base_name, label):
 def run_partition_pdf(
     pdf_path,
     strategy,
+    scope,
     images,
     output_type="plot",
     output_root_dir="",
@@ -113,13 +114,14 @@ def run_partition_pdf(
     output_dir = os.path.join(output_root_dir, strategy, f_base_name)
     os.makedirs(output_dir, exist_ok=True)
 
-    original_elements = partition_pdf(
-        filename=pdf_path,
-        strategy=strategy,
-        include_page_breaks=True,
-        sort_mode=SORT_MODE_BASIC,
-    )
-    draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")
+    if scope == "all":
+        original_elements = partition_pdf(
+            filename=pdf_path,
+            strategy=strategy,
+            include_page_breaks=True,
+            sort_mode=SORT_MODE_BASIC,
+        )
+        draw_elements(original_elements, images, output_type, output_dir, f_base_name, "original")
 
     ordered_elements = partition_pdf(
         filename=pdf_path,
@@ -134,22 +136,27 @@ def run_partition_pdf(
 def run():
     f_sub_path = sys.argv[1]
     strategy = sys.argv[2]
+    scope = sys.argv[3]
 
     base_dir = os.getcwd()
     output_root_dir = os.path.join(base_dir, "examples", "custom-layout-order", "output")
     os.makedirs(output_root_dir, exist_ok=True)
 
     f_path = os.path.join(base_dir, f_sub_path)
     images = pdf2image.convert_from_path(f_path)
-    run_partition_pdf(f_path, strategy, images, "image", output_root_dir)
+    run_partition_pdf(f_path, strategy, scope, images, "image", output_root_dir)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) < 3:
+    if len(sys.argv) < 4:
         print(
-            "Please provide the path to the file name as the first argument and the strategy as the "
-            "second argument.",
+            "Please provide the path to the file name as the first argument, the strategy as the "
+            "second argument and the scope as the third argument.",
         )
         sys.exit(1)
 
+    if sys.argv[3] not in ["all", "xycut_only"]:
+        print("Invalid scope")
+        sys.exit(1)
+
     run()
diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
@@ -117,9 +117,10 @@ def test_partition_image_with_auto_strategy(
     elements = image.partition_image(filename=filename, strategy="auto")
     titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
     title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    idx = 2
     assert titles[0].text == title
-    assert elements[0].metadata.detection_class_prob is not None
-    assert isinstance(elements[0].metadata.detection_class_prob, float)
+    assert elements[idx].metadata.detection_class_prob is not None
+    assert isinstance(elements[idx].metadata.detection_class_prob, float)
 
 
 def test_partition_image_with_table_extraction(
@@ -240,11 +241,12 @@ def test_partition_image_default_strategy_hi_res():
     with open(filename, "rb") as f:
         elements = image.partition_image(file=f)
 
-    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    assert elements[0].text == first_line
-    assert elements[0].metadata.coordinates is not None
-    assert elements[0].metadata.detection_class_prob is not None
-    assert isinstance(elements[0].metadata.detection_class_prob, float)
+    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    idx = 2
+    assert elements[idx].text == title
+    assert elements[idx].metadata.coordinates is not None
+    assert elements[idx].metadata.detection_class_prob is not None
+    assert isinstance(elements[idx].metadata.detection_class_prob, float)
 
 
 def test_partition_image_metadata_date(

diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -196,9 +196,9 @@ def test_partition_pdf_with_auto_strategy(
 ):
     elements = pdf.partition_pdf(filename=filename, strategy="auto")
     title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
-    assert elements[0].text == title
-    assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
-    assert elements[0].metadata.file_directory == "example-docs"
+    assert elements[7].text == title
+    assert elements[7].metadata.filename == "layout-parser-paper-fast.pdf"
+    assert elements[7].metadata.file_directory == "example-docs"
 
 
 def test_partition_pdf_with_page_breaks(
@@ -388,13 +388,12 @@ def test_partition_pdf_uses_table_extraction():
 def test_partition_pdf_with_copy_protection():
     filename = os.path.join("example-docs", "copy-protected.pdf")
     elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
-    elements[0] == Title(
-        "LayoutParser: A Uniﬁed Toolkit for Deep Based Document Image Analysis",
-    )
-    # check that the pdf has multiple different page numbers
+    title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
+    idx = 3
+    assert elements[idx].text == title
     assert {element.metadata.page_number for element in elements} == {1, 2}
-    assert elements[0].metadata.detection_class_prob is not None
-    assert isinstance(elements[0].metadata.detection_class_prob, float)
+    assert elements[idx].metadata.detection_class_prob is not None
+    assert isinstance(elements[idx].metadata.detection_class_prob, float)
 
 
 def test_partition_pdf_with_dpi():
@@ -518,7 +517,7 @@ def test_partition_pdf_with_auto_strategy_exclude_metadata(
         include_metadata=False,
     )
     title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
-    assert elements[0].text == title
+    assert elements[7].text == title
     for i in range(len(elements)):
         assert elements[i].metadata.to_dict() == {}
 

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -302,17 +302,19 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename, content_type,
         strategy="hi_res",
     )
 
-    assert isinstance(elements[0], Title)
-    assert elements[0].text.startswith("LayoutParser")
+    idx = 3
+    assert isinstance(elements[idx], Title)
+    assert elements[idx].text.startswith("LayoutParser")
 
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[idx].metadata.filename == os.path.basename(filename)
+    assert elements[idx].metadata.file_directory == os.path.split(filename)[0]
 
     # NOTE(alan): Xfail since new model skips the word Zejiang
     request.applymarker(pytest.mark.xfail)
 
-    assert isinstance(elements[1], NarrativeText)
-    assert elements[1].text.startswith("Zejiang Shen")
+    idx += 1
+    assert isinstance(elements[idx], NarrativeText)
+    assert elements[idx].text.startswith("Zejiang Shen")
 
 
 def test_auto_partition_pdf_uses_table_extraction():
@@ -361,14 +363,16 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename, content_type, requ
             strategy="hi_res",
         )
 
-    assert isinstance(elements[0], Title)
-    assert elements[0].text.startswith("LayoutParser")
+    idx = 3
+    assert isinstance(elements[idx], Title)
+    assert elements[idx].text.startswith("LayoutParser")
 
     # NOTE(alan): Xfail since new model misses the first word Zejiang
     request.applymarker(pytest.mark.xfail)
 
-    assert isinstance(elements[1], NarrativeText)
-    assert elements[1].text.startswith("Zejiang Shen")
+    idx += 1
+    assert isinstance(elements[idx], NarrativeText)
+    assert elements[idx].text.startswith("Zejiang Shen")
 
 
 def test_auto_partition_formats_languages_for_tesseract():
@@ -425,9 +429,10 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
     )
 
     # should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
-    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    assert elements[0].text == first_line
-    assert elements[0].metadata.coordinates is not None
+    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    idx = 2
+    assert elements[idx].text == title
+    assert elements[idx].metadata.coordinates is not None
 
 
 @pytest.mark.parametrize(

diff --git a/test_unstructured/partition/utils/test_xycut.py b/test_unstructured/partition/utils/test_xycut.py
@@ -0,0 +1,60 @@
+import numpy as np
+import pytest
+
+from unstructured.partition.utils.xycut import (
+    projection_by_bboxes,
+    recursive_xy_cut,
+    recursive_xy_cut_swapped,
+    split_projection_profile,
+)
+
+
+def test_projection_by_bboxes():
+    boxes = np.array([[10, 20, 50, 60], [30, 40, 70, 80]])
+
+    # Test case 1: Horizontal projection
+    result_horizontal = projection_by_bboxes(boxes, 0)
+    expected_result_horizontal = np.array(
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+    )
+    assert np.array_equal(result_horizontal[:30], expected_result_horizontal)
+
+    # Test case 2: Vertical projection
+    result_vertical = projection_by_bboxes(boxes, 1)
+    expected_result_vertical = np.array(
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+    )
+    assert np.array_equal(result_vertical[:30], expected_result_vertical)
+
+
+def test_split_projection_profile():
+    # Test case 1: Sample projection profile with given min_value and min_gap
+    arr_values = np.array([0, 0, 3, 4, 0, 0, 2, 0, 0, 0, 5, 6, 7, 0, 0, 0])
+    min_value = 0
+    min_gap = 1
+    result = split_projection_profile(arr_values, min_value, min_gap)
+    expected_result = (np.array([2, 6, 10]), np.array([4, 7, 13]))
+    assert np.array_equal(result, expected_result)
+
+    # Test case 2: Another sample projection profile with different parameters
+    arr_values = np.array([0, 2, 0, 0, 0, 3, 0, 0, 4, 5, 6, 0, 0, 0])
+    min_value = 1
+    min_gap = 2
+    result = split_projection_profile(arr_values, min_value, min_gap)
+    expected_result = (np.array([1, 5, 8]), np.array([2, 6, 11]))
+    assert np.array_equal(result, expected_result)
+
+
+@pytest.mark.parametrize(
+    ("recursive_func", "expected"),
+    [
+        (recursive_xy_cut, [0, 1, 2]),
+        (recursive_xy_cut_swapped, [0, 2, 1]),
+    ],
+)
+def test_recursive_xy_cut(recursive_func, expected):
+    boxes = np.array([[0, 0, 20, 20], [200, 0, 230, 30], [0, 40, 50, 50]])
+    indices = np.array([0, 1, 2])
+    res = []
+    recursive_func(boxes, indices, res)
+    assert res == expected
diff --git a/...ected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/...ected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -437,7 +437,7 @@
     "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
   },
   {
-    "type": "Footer",
+    "type": "UncategorizedText",
     "element_id": "d4735e3a265e16eee03f59718b9b5d03",
     "metadata": {
       "data_source": {
@@ -456,7 +456,7 @@
     "text": "2"
   },
   {
-    "type": "UncategorizedText",
+    "type": "Footer",
     "element_id": "d4735e3a265e16eee03f59718b9b5d03",
     "metadata": {
       "data_source": {