Skip to content

Commit

Permalink
Feat/1136 elements ordering for pdf (#1161)
Browse files Browse the repository at this point in the history
### Summary
Address
[#1136](#1136) for
`hi_res` and `fast` strategies. The `ocr_only` strategy does not include
coordinates.
- add functionality to switch sort mode between the current `basic`
sorting and the new `xy-cut` sorting for `hi_res` and `fast` strategies
- add the script to evaluate the `xy-cut` sorting approach
- add jupyter notebook to provide evaluation and visualization for the
`xy-cut` sorting approach

### Evaluation
```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py <file_path> <strategy>
```
Here, the file should be under the project root directory. For example,
```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py example-docs/multi-column-2p.pdf fast
```
  • Loading branch information
christinestraub authored Aug 25, 2023
1 parent f267cef commit 483b09b
Show file tree
Hide file tree
Showing 32 changed files with 2,688 additions and 2,192 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,7 @@ tags
# Ruff cache
.ruff_cache/

unstructured-inference/
unstructured-inference/

example-docs/*_images
examples/**/output/
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
### Enhancements
* Add threaded Slack conversations into Slack connector output

* Add functionality to sort elements using the `xy-cut` sorting approach in `partition_pdf` for the `hi_res` and `fast` strategies

### Features

### Fixes
Expand Down
Binary file added example-docs/multi-column-2p.pdf
Binary file not shown.
Binary file added example-docs/multi-column.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions examples/custom-layout-order/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Custom Layout Sorting

This directory contains examples of how `xy-cut` sorting works.

## Running the example

### Running the script (.py)

```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py <file_path> <strategy>
```
Here, `<file_path>` is given relative to the project root directory, and the command should be run from that directory. For example,
```
export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_xy_cut_sorting.py example-docs/multi-column-2p.pdf fast
```

### Running jupyter notebook
The Google Colab version of the notebook can be found here: `<Unstructured colab Gdrive>/evaluate_xy_cut_sorting.ipynb`
155 changes: 155 additions & 0 deletions examples/custom-layout-order/evaluate_xy_cut_sorting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import os
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pdf2image

from unstructured.documents.elements import PageBreak
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import (
bbox2points,
recursive_xy_cut,
vis_polygons_with_index,
)


def show_plot(image, desired_width=None):
    """Display an image in a matplotlib figure.

    Args:
        image: Image array exposing ``shape`` whose first two entries are
            (height, width). Both 2-D (grayscale) and 3-D (color) arrays are
            accepted.
        desired_width: Optional figure width passed to matplotlib's
            ``figsize``. When falsy, matplotlib's default figure size is used.
    """
    if desired_width:
        # Only the leading (height, width) axes are needed, so slice instead
        # of unpacking exactly three values; this keeps 2-D images working.
        image_height, image_width = image.shape[:2]

        # Derive the figure height from the image's own aspect ratio so the
        # page is not stretched.
        aspect_ratio = image_width / image_height
        desired_height = desired_width / aspect_ratio
        _, ax = plt.subplots(figsize=(desired_width, desired_height))
    else:
        _, ax = plt.subplots()

    ax.imshow(image)
    plt.show()


def extract_element_coordinates(elements):
    """Group the coordinates of ``elements`` into one list per page.

    A ``PageBreak`` element closes the current page. Pages that collected no
    coordinates are not emitted, so consecutive or trailing page breaks do not
    produce empty entries.

    Args:
        elements: Iterable of elements; every non-``PageBreak`` element must
            expose ``metadata.coordinates``.

    Returns:
        A list of pages, each a list of the ``metadata.coordinates`` values of
        that page's elements in their original order.
    """
    pages = []
    current_page = []

    for element in elements:
        if not isinstance(element, PageBreak):
            current_page.append(element.metadata.coordinates)
            continue

        # Page boundary: flush what was collected, if anything.
        if current_page:
            pages.append(current_page)
            current_page = []

    # Flush the last page when the sequence does not end with a page break.
    if current_page:
        pages.append(current_page)

    return pages


def convert_coordinates_to_boxes(coordinates, image):
    """Scale element coordinates to integer pixel boxes of ``image``.

    Args:
        coordinates: Iterable of coordinate objects exposing ``points`` (where
            ``points[0]`` is treated as the top-left corner and ``points[2]``
            as the bottom-right corner) and ``system.width`` /
            ``system.height`` for the coordinate system they are expressed in.
        image: Image array exposing ``shape`` whose first two entries are
            (height, width). Both 2-D and 3-D arrays are accepted.

    Returns:
        A list of ``[left, top, right, bottom]`` boxes, truncated to ``int``,
        in the same order as ``coordinates``.
    """
    # The image size does not change per coordinate, so read it once. Slicing
    # (rather than unpacking three values) also supports grayscale images.
    image_height, image_width = image.shape[:2]

    boxes = []
    for coordinate in coordinates:
        points = coordinate.points
        src_left, src_top = points[0]
        src_right, src_bottom = points[2]
        system_width = coordinate.system.width
        system_height = coordinate.system.height

        # Rescale from the element's coordinate system to image pixels.
        left = src_left * image_width / system_width
        right = src_right * image_width / system_width
        top = src_top * image_height / system_height
        bottom = src_bottom * image_height / system_height
        boxes.append([int(left), int(top), int(right), int(bottom)])

    return boxes


def order_boxes(boxes):
    """Return ``boxes`` reordered by the recursive xy-cut algorithm.

    Args:
        boxes: Sequence of ``[left, top, right, bottom]`` boxes.

    Returns:
        A list of boxes in xy-cut order. An empty input yields an empty list.
    """
    # Nothing to order; this also avoids indexing with an empty, non-integer
    # index array further down.
    if len(boxes) == 0:
        return []

    box_array = np.asarray(boxes)
    order = []
    # recursive_xy_cut works on integer boxes and fills ``order`` with the
    # indices of the boxes in reading order.
    recursive_xy_cut(box_array.astype(int), np.arange(len(box_array)), order)
    return box_array[np.array(order)].tolist()


def draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label):
    """Annotate ``image`` with indexed boxes, then show and/or save the result.

    Args:
        image: Page image array to annotate.
        boxes: Boxes to draw; each is converted with ``bbox2points``.
        output_dir: Directory that receives the saved image.
        base_name: File-name prefix of the saved image.
        page_num: Page number used in the printed header and the file name.
        output_type: ``"plot"`` to display, ``"image"`` to save, ``"all"`` for
            both. Any other value does neither.
        label: Tag used in the printed header and the file name.
    """
    polygons = [bbox2points(box) for box in boxes]
    annotated_image = vis_polygons_with_index(image, polygons)

    show_requested = output_type in ("plot", "all")
    save_requested = output_type in ("image", "all")

    if show_requested:
        print(f"{label} elements - Page: {page_num}")
        show_plot(annotated_image, desired_width=20)

    if save_requested:
        file_name = f"{base_name}_{page_num}_{label}.jpg"
        # NOTE(review): cv2.imwrite treats 3-channel arrays as BGR. If `image`
        # is RGB (e.g. converted from a PIL image), the saved colors are
        # channel-swapped - confirm whether a conversion is wanted here.
        cv2.imwrite(os.path.join(output_dir, file_name), annotated_image)


def draw_elements(elements, images, output_type, output_dir, base_name, label):
    """Draw the element boxes of every page onto its matching page image.

    Args:
        elements: Partitioned elements, with page breaks separating pages.
        images: One page image per page that contains elements.
        output_type: Forwarded to ``draw_boxes``.
        output_dir: Forwarded to ``draw_boxes``.
        base_name: Forwarded to ``draw_boxes``.
        label: Forwarded to ``draw_boxes``.
    """
    per_page_coordinates = extract_element_coordinates(elements)

    # Pages and images are paired positionally, so the counts must agree.
    assert len(images) == len(per_page_coordinates)

    paired_pages = zip(images, per_page_coordinates)
    for page_num, (page_image, page_coordinates) in enumerate(paired_pages, start=1):
        pixels = np.array(page_image)
        page_boxes = convert_coordinates_to_boxes(page_coordinates, pixels)
        draw_boxes(pixels, page_boxes, output_dir, base_name, page_num, output_type, label)


def run_partition_pdf(
    pdf_path,
    strategy,
    images,
    output_type="plot",
    output_root_dir="",
):
    """Partition a PDF with both sort modes and draw the resulting element order.

    The PDF is partitioned first with the basic sort mode (drawn with the
    label ``"original"``) and then with the xy-cut sort mode (drawn with the
    label ``"result"``). Output goes to
    ``<output_root_dir>/<strategy>/<pdf base name>``.

    Args:
        pdf_path: Path to the PDF file.
        strategy: Partitioning strategy passed to ``partition_pdf``.
        images: Page images of the PDF, passed to ``draw_elements``.
        output_type: Forwarded to ``draw_elements``.
        output_root_dir: Root directory under which output is written.
    """
    print(f">>> Starting run_partition_pdf - f_path: {pdf_path} - strategy: {strategy}")
    pdf_stem, _extension = os.path.splitext(os.path.basename(pdf_path))

    output_dir = os.path.join(output_root_dir, strategy, pdf_stem)
    os.makedirs(output_dir, exist_ok=True)

    # Same pipeline twice; only the sort mode and the output label differ.
    passes = (
        (SORT_MODE_BASIC, "original"),
        (SORT_MODE_XY_CUT, "result"),
    )
    for sort_mode, label in passes:
        elements = partition_pdf(
            filename=pdf_path,
            strategy=strategy,
            include_page_breaks=True,
            sort_mode=sort_mode,
        )
        draw_elements(elements, images, output_type, output_dir, pdf_stem, label)

    print("<<< Finished run_partition_pdf")


def run():
    """Run the xy-cut evaluation for the file and strategy given on the command line.

    Reads the file path from ``sys.argv[1]`` and the strategy from
    ``sys.argv[2]``. The path is joined to the current working directory, and
    annotated images are written under
    ``examples/custom-layout-order/output`` in that directory.
    """
    f_sub_path, strategy = sys.argv[1], sys.argv[2]

    working_dir = os.getcwd()
    output_root_dir = os.path.join(working_dir, "examples", "custom-layout-order", "output")
    os.makedirs(output_root_dir, exist_ok=True)

    pdf_path = os.path.join(working_dir, f_sub_path)
    page_images = pdf2image.convert_from_path(pdf_path)
    run_partition_pdf(pdf_path, strategy, page_images, "image", output_root_dir)


if __name__ == "__main__":
    # Both the file path and the strategy are required positional arguments.
    if len(sys.argv) >= 3:
        run()
    else:
        print(
            "Please provide the path to the file name as the first argument and the strategy as the "
            "second argument.",
        )
        sys.exit(1)
40 changes: 0 additions & 40 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,14 @@

import magic
import pytest
from PIL import Image
from unstructured_inference.inference import layout
from unstructured_inference.inference.layoutelement import LocationlessLayoutElement

from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
FileType,
_get_page_image_metadata,
_is_code_mime_type,
_is_text_file_a_csv,
_is_text_file_a_json,
detect_filetype,
document_to_element_list,
)

FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand All @@ -31,29 +26,6 @@
]


class MockPageLayout(layout.PageLayout):
    """Page layout stand-in that exposes one element without coordinates."""

    def __init__(self, number: int, image: Image.Image):
        # The parent initializer is not called; only these two attributes are
        # populated. ``Image`` is the PIL module, so the image class itself is
        # ``Image.Image``.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a single locationless "Headline" element."""
        return [
            LocationlessLayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]


class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stand-in made of a single mocked page."""

    @property
    def pages(self):
        """Return one ``MockPageLayout`` numbered 1 with a 1x1 image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]


@pytest.mark.parametrize(
("file", "expected"),
[
Expand Down Expand Up @@ -467,15 +439,3 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):

with open(filename, "rb") as f:
assert detect_filetype(file=f) == FileType.CSV


def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """Elements converted from a locationless layout carry no coordinates."""
    mock_document = MockDocumentLayout()
    converted = document_to_element_list(mock_document)
    first_element = converted[0]
    assert first_element.metadata.coordinates is None


def test_get_page_image_metadata_and_coordinate_system():
    """Page image metadata for a mocked page is returned as a dict."""
    first_page = MockDocumentLayout().pages[0]
    page_metadata = _get_page_image_metadata(first_page)
    assert isinstance(page_metadata, dict)
10 changes: 5 additions & 5 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,10 +470,10 @@ def test_partition_pdf_fast_groups_text_in_text_box():
assert str(elements[1]).endswith("Jordan and Egypt.")

expected_coordinate_points_3 = (
(273.9929, 181.16470000000004),
(273.9929, 226.16470000000004),
(333.59990000000005, 226.16470000000004),
(333.59990000000005, 181.16470000000004),
(95.6683, 181.16470000000004),
(95.6683, 226.16470000000004),
(166.7908, 226.16470000000004),
(166.7908, 181.16470000000004),
)
expected_coordinate_system_3 = PixelSpace(width=612, height=792)
expected_elem_metadata_3 = ElementMetadata(
Expand All @@ -482,7 +482,7 @@ def test_partition_pdf_fast_groups_text_in_text_box():
system=expected_coordinate_system_3,
),
)
assert elements[3] == Title("1st", metadata=expected_elem_metadata_3)
assert elements[3] == Text("2.5", metadata=expected_elem_metadata_3)


def test_partition_pdf_with_metadata_filename(
Expand Down
44 changes: 43 additions & 1 deletion test_unstructured/partition/test_common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import pytest
from PIL import Image
from unstructured_inference.inference import layout
from unstructured_inference.inference.layout import LayoutElement
from unstructured_inference.inference.layoutelement import LocationlessLayoutElement

from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
Expand All @@ -11,7 +14,34 @@
Title,
)
from unstructured.partition import common
from unstructured.partition.common import contains_emoji
from unstructured.partition.common import (
_get_page_image_metadata,
contains_emoji,
document_to_element_list,
)


class MockPageLayout(layout.PageLayout):
    """Page layout stand-in that exposes one element without coordinates."""

    def __init__(self, number: int, image: Image.Image):
        # The parent initializer is not called; only these two attributes are
        # populated. ``Image`` is the PIL module, so the image class itself is
        # ``Image.Image``.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a single locationless "Headline" element."""
        return [
            LocationlessLayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]


class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stand-in made of a single mocked page."""

    @property
    def pages(self):
        """Return one ``MockPageLayout`` numbered 1 with a 1x1 image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]


def test_normalize_layout_element_dict():
Expand Down Expand Up @@ -243,3 +273,15 @@ def test_convert_ms_office_table_to_text_works_with_empty_tables():
)
def test_contains_emoji(text, expected):
assert contains_emoji(text) is expected


def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """Elements converted from a locationless layout carry no coordinates."""
    mock_document = MockDocumentLayout()
    converted = document_to_element_list(mock_document)
    first_element = converted[0]
    assert first_element.metadata.coordinates is None


def test_get_page_image_metadata_and_coordinate_system():
    """Page image metadata for a mocked page is returned as a dict."""
    first_page = MockDocumentLayout().pages[0]
    page_metadata = _get_page_image_metadata(first_page)
    assert isinstance(page_metadata, dict)
Loading

0 comments on commit 483b09b

Please sign in to comment.