Feat/1136 elements ordering for pdf (#1161)

### Summary

Addresses [#1136](#1136) for the `hi_res` and `fast` strategies. The `ocr_only` strategy does not include coordinates.

- add functionality to switch the sort mode between the current `basic` sorting and the new `xy-cut` sorting for the `hi_res` and `fast` strategies
- add a script to evaluate the `xy-cut` sorting approach
- add a Jupyter notebook that provides evaluation and visualization for the `xy-cut` sorting approach

### Evaluation

```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py <file_path> <strategy>
```

Here, the file should be under the project root directory. For example,

```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py example-docs/multi-column-2p.pdf fast
```
Unstructured-IO · Aug 25, 2023 · 483b09b · 483b09b
1 parent f267cef
commit 483b09b
Show file tree

Hide file tree

Showing 32 changed files with 2,688 additions and 2,192 deletions.
diff --git a/.gitignore b/.gitignore
@@ -187,4 +187,7 @@ tags
 # Ruff cache
 .ruff_cache/
 
-unstructured-inference/
+unstructured-inference/
+
+example-docs/*_images
+examples/**/output/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,8 @@
 ### Enhancements
 * Add threaded Slack conversations into Slack connector output
 
+* Add functionality to sort elements using `xy-cut` sorting approach in `partition_pdf` for `hi_res` and `fast` strategies
+
 ### Features
 
 ### Fixes

diff --git a/example-docs/multi-column-2p.pdf b/example-docs/multi-column-2p.pdf
diff --git a/example-docs/multi-column.pdf b/example-docs/multi-column.pdf
diff --git a/examples/custom-layout-order/README.md b/examples/custom-layout-order/README.md
@@ -0,0 +1,18 @@
+# Custom Layout Sorting
+
+This directory contains examples of how `xy-cut` sorting works.
+
+## Running the example
+
+### Running the script (.py)
+
+```
+export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py <file_path> <strategy>
+```
+Here, the file should be under the project root directory. For example,
+```
+export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py example-docs/multi-column-2p.pdf fast
+```
+
+### Running the Jupyter notebook
+The Google Colab version of the notebook can be found here: `<Unstructured colab Gdrive>/evaluate_xy_cut_sorting.ipynb`
diff --git a/examples/custom-layout-order/evaluate_xy_cut_sorting.py b/examples/custom-layout-order/evaluate_xy_cut_sorting.py
@@ -0,0 +1,155 @@
+import os
+import sys
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import pdf2image
+
+from unstructured.documents.elements import PageBreak
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
+from unstructured.partition.utils.xycut import (
+    bbox2points,
+    recursive_xy_cut,
+    vis_polygons_with_index,
+)
+
+
+def show_plot(image, desired_width=None):
+    """Display ``image`` in a matplotlib figure.
+
+    When ``desired_width`` is truthy, the figure is created with that width and a
+    height derived from the image's width/height ratio; otherwise the figure is
+    created with matplotlib's defaults.
+    """
+    image_height, image_width, _ = image.shape
+
+    figure_kwargs = {}
+    if desired_width:
+        # Keep the figure's proportions equal to the image's aspect ratio.
+        aspect_ratio = image_width / image_height
+        figure_kwargs["figsize"] = (desired_width, desired_width / aspect_ratio)
+
+    _, axes = plt.subplots(**figure_kwargs)
+    axes.imshow(image)
+    plt.show()
+
+
+def extract_element_coordinates(elements):
+    """Group the ``metadata.coordinates`` of ``elements`` into one list per page.
+
+    ``PageBreak`` elements act as page separators and contribute no coordinates.
+    A page that accumulated no coordinates is not added to the result.
+    """
+    pages = []
+    current_page = []
+
+    for element in elements:
+        if not isinstance(element, PageBreak):
+            current_page.append(element.metadata.coordinates)
+            continue
+
+        # A page break closes the page collected so far (if it has any entries).
+        if current_page:
+            pages.append(current_page)
+            current_page = []
+
+    # Flush the trailing page when the elements do not end with a page break.
+    if current_page:
+        pages.append(current_page)
+
+    return pages
+
+
+def convert_coordinates_to_boxes(coordinates, image):
+    """Scale element coordinates into integer pixel boxes for ``image``.
+
+    Args:
+        coordinates: iterable of objects exposing ``points`` (index 0 is read as the
+            ``(left, top)`` corner and index 2 as the ``(right, bottom)`` corner) and
+            ``system`` with ``width`` and ``height`` attributes.
+        image: array whose ``shape`` starts with ``(height, width)``; a trailing
+            channel dimension is optional.
+
+    Returns:
+        A list of ``[left, top, right, bottom]`` boxes (values truncated with ``int``),
+        one per entry in ``coordinates``.
+    """
+    # The image size does not change per coordinate, so read it once. Slicing the
+    # shape (instead of a 3-way unpack) also accepts 2-D grayscale arrays.
+    image_height, image_width = image.shape[:2]
+    boxes = []
+
+    for coordinate in coordinates:
+        points = coordinate.points
+        _left, _top = points[0]
+        _right, _bottom = points[2]
+        w = coordinate.system.width
+        h = coordinate.system.height
+        # Rescale from the element's coordinate system to the image's pixel grid.
+        left = _left * image_width / w
+        right = _right * image_width / w
+        top = _top * image_height / h
+        bottom = _bottom * image_height / h
+        boxes.append([int(left), int(top), int(right), int(bottom)])
+
+    return boxes
+
+
+def order_boxes(boxes):
+    """Return ``boxes`` reordered by the index sequence from ``recursive_xy_cut``.
+
+    ``recursive_xy_cut`` receives the boxes as an integer array, the box indices,
+    and an output list; that list is then used to index the boxes.
+    """
+    sorted_indices = []
+    integer_boxes = np.asarray(boxes).astype(int)
+    recursive_xy_cut(integer_boxes, np.arange(len(boxes)), sorted_indices)
+    return np.array(boxes)[np.array(sorted_indices)].tolist()
+
+
+def draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label):
+    """Annotate ``image`` with ``boxes`` and plot and/or save the result.
+
+    ``output_type`` selects the output: ``"plot"`` shows the annotated image,
+    ``"image"`` writes it to ``<output_dir>/<base_name>_<page_num>_<label>.jpg``,
+    and ``"all"`` does both. Any other value produces no output.
+    """
+    polygons = [bbox2points(box) for box in boxes]
+    annotated_image = vis_polygons_with_index(image, polygons)
+
+    wants_plot = output_type in ["plot", "all"]
+    wants_file = output_type in ["image", "all"]
+
+    if wants_plot:
+        print(f"{label} elements - Page: {page_num}")
+        show_plot(annotated_image, desired_width=20)
+
+    if wants_file:
+        file_name = f"{base_name}_{page_num}_{label}.jpg"
+        cv2.imwrite(os.path.join(output_dir, file_name), annotated_image)
+
+
+def draw_elements(elements, images, output_type, output_dir, base_name, label):
+    """Draw the boxes of ``elements`` onto the corresponding page ``images``.
+
+    Element coordinates are grouped per page, scaled to each page image, and
+    handed to ``draw_boxes`` with 1-based page numbers.
+    """
+    pages_coordinates = extract_element_coordinates(elements)
+
+    # One coordinate group is expected for every page image.
+    assert len(images) == len(pages_coordinates)
+    for page_num, (page_image, page_coordinates) in enumerate(
+        zip(images, pages_coordinates),
+        start=1,
+    ):
+        image_array = np.array(page_image)
+        page_boxes = convert_coordinates_to_boxes(page_coordinates, image_array)
+        draw_boxes(image_array, page_boxes, output_dir, base_name, page_num, output_type, label)
+
+
+def run_partition_pdf(
+    pdf_path,
+    strategy,
+    images,
+    output_type="plot",
+    output_root_dir="",
+):
+    """Partition a PDF with both sort modes and draw the elements of each run.
+
+    The PDF is partitioned first with ``SORT_MODE_BASIC`` (drawn with the label
+    ``"original"``) and then with ``SORT_MODE_XY_CUT`` (label ``"result"``).
+    Output goes under ``<output_root_dir>/<strategy>/<pdf base name>``, which is
+    created if missing.
+    """
+    print(f">>> Starting run_partition_pdf - f_path: {pdf_path} - strategy: {strategy}")
+    f_base_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+    output_dir = os.path.join(output_root_dir, strategy, f_base_name)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Same partition call for both runs; only the sort mode and drawing label differ.
+    runs = ((SORT_MODE_BASIC, "original"), (SORT_MODE_XY_CUT, "result"))
+    for sort_mode, label in runs:
+        partitioned_elements = partition_pdf(
+            filename=pdf_path,
+            strategy=strategy,
+            include_page_breaks=True,
+            sort_mode=sort_mode,
+        )
+        draw_elements(partitioned_elements, images, output_type, output_dir, f_base_name, label)
+    print("<<< Finished run_partition_pdf")
+
+
+def run():
+    """Run the evaluation for the PDF and strategy given on the command line.
+
+    ``sys.argv[1]`` is the PDF path relative to the current working directory and
+    ``sys.argv[2]`` is the partitioning strategy. Annotated images are written
+    under ``examples/custom-layout-order/output`` in the working directory.
+    """
+    relative_pdf_path = sys.argv[1]
+    strategy = sys.argv[2]
+
+    working_dir = os.getcwd()
+    output_root_dir = os.path.join(working_dir, "examples", "custom-layout-order", "output")
+    os.makedirs(output_root_dir, exist_ok=True)
+
+    pdf_path = os.path.join(working_dir, relative_pdf_path)
+    page_images = pdf2image.convert_from_path(pdf_path)
+    run_partition_pdf(
+        pdf_path,
+        strategy,
+        page_images,
+        output_type="image",
+        output_root_dir=output_root_dir,
+    )
+
+
+if __name__ == '__main__':
+    # Both the file path and the strategy are required positional arguments.
+    if len(sys.argv) < 3:
+        usage_message = (
+            "Please provide the path to the file name as the first argument and the strategy as the "
+            "second argument."
+        )
+        print(usage_message)
+        sys.exit(1)
+
+    run()
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -4,19 +4,14 @@
 
 import magic
 import pytest
-from PIL import Image
-from unstructured_inference.inference import layout
-from unstructured_inference.inference.layoutelement import LocationlessLayoutElement
 
 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
     FileType,
-    _get_page_image_metadata,
     _is_code_mime_type,
     _is_text_file_a_csv,
     _is_text_file_a_json,
     detect_filetype,
-    document_to_element_list,
 )
 
 FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -31,29 +26,6 @@
 ]
 
 
-class MockPageLayout(layout.PageLayout):
-    def __init__(self, number: int, image: Image):
-        self.number = number
-        self.image = image
-
-    @property
-    def elements(self):
-        return [
-            LocationlessLayoutElement(
-                type="Headline",
-                text="Charlie Brown and the Great Pumpkin",
-            ),
-        ]
-
-
-class MockDocumentLayout(layout.DocumentLayout):
-    @property
-    def pages(self):
-        return [
-            MockPageLayout(number=1, image=Image.new("1", (1, 1))),
-        ]
-
-
 @pytest.mark.parametrize(
     ("file", "expected"),
     [
@@ -467,15 +439,3 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
 
     with open(filename, "rb") as f:
         assert detect_filetype(file=f) == FileType.CSV
-
-
-def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
-    layout_elem_absent_coordinates = MockDocumentLayout()
-    elements = document_to_element_list(layout_elem_absent_coordinates)
-    assert elements[0].metadata.coordinates is None
-
-
-def test_get_page_image_metadata_and_coordinate_system():
-    doc = MockDocumentLayout()
-    metadata = _get_page_image_metadata(doc.pages[0])
-    assert isinstance(metadata, dict)
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -470,10 +470,10 @@ def test_partition_pdf_fast_groups_text_in_text_box():
     assert str(elements[1]).endswith("Jordan and Egypt.")
 
     expected_coordinate_points_3 = (
-        (273.9929, 181.16470000000004),
-        (273.9929, 226.16470000000004),
-        (333.59990000000005, 226.16470000000004),
-        (333.59990000000005, 181.16470000000004),
+        (95.6683, 181.16470000000004),
+        (95.6683, 226.16470000000004),
+        (166.7908, 226.16470000000004),
+        (166.7908, 181.16470000000004),
     )
     expected_coordinate_system_3 = PixelSpace(width=612, height=792)
     expected_elem_metadata_3 = ElementMetadata(
@@ -482,7 +482,7 @@ def test_partition_pdf_fast_groups_text_in_text_box():
             system=expected_coordinate_system_3,
         ),
     )
-    assert elements[3] == Title("1st", metadata=expected_elem_metadata_3)
+    assert elements[3] == Text("2.5", metadata=expected_elem_metadata_3)
 
 
 def test_partition_pdf_with_metadata_filename(

diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
@@ -1,5 +1,8 @@
 import pytest
+from PIL import Image
+from unstructured_inference.inference import layout
 from unstructured_inference.inference.layout import LayoutElement
+from unstructured_inference.inference.layoutelement import LocationlessLayoutElement
 
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
@@ -11,7 +14,34 @@
     Title,
 )
 from unstructured.partition import common
-from unstructured.partition.common import contains_emoji
+from unstructured.partition.common import (
+    _get_page_image_metadata,
+    contains_emoji,
+    document_to_element_list,
+)
+
+
+class MockPageLayout(layout.PageLayout):
+    """Test double for ``layout.PageLayout`` that returns a fixed element list."""
+
+    def __init__(self, number: int, image: Image.Image):
+        # Only these two attributes are set; the parent initializer is not called.
+        self.number = number
+        self.image = image
+
+    @property
+    def elements(self):
+        # A single headline element built from LocationlessLayoutElement.
+        return [
+            LocationlessLayoutElement(
+                type="Headline",
+                text="Charlie Brown and the Great Pumpkin",
+            ),
+        ]
+
+
+class MockDocumentLayout(layout.DocumentLayout):
+    """Test double for ``layout.DocumentLayout`` exposing one mocked page."""
+
+    @property
+    def pages(self):
+        # One page, numbered 1, backed by a 1x1 image in PIL mode "1".
+        return [
+            MockPageLayout(number=1, image=Image.new("1", (1, 1))),
+        ]
 
 
 def test_normalize_layout_element_dict():
@@ -243,3 +273,15 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
 )
 def test_contains_emoji(text, expected):
     assert contains_emoji(text) is expected
+
+
+def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
+    """The first element converted from the mocked layout has no coordinates metadata."""
+    layout_elem_absent_coordinates = MockDocumentLayout()
+    elements = document_to_element_list(layout_elem_absent_coordinates)
+    assert elements[0].metadata.coordinates is None
+
+
+def test_get_page_image_metadata_and_coordinate_system():
+    """``_get_page_image_metadata`` returns a dict for a mocked page."""
+    doc = MockDocumentLayout()
+    metadata = _get_page_image_metadata(doc.pages[0])
+    assert isinstance(metadata, dict)