Skip to content

Commit

Permalink
Extract coordinates from PDFs and images when using OCR only strategy (
Browse files Browse the repository at this point in the history
…#1163)

### Summary
Closes #983 
Creates new function `add_pytesseract_bbox_to_elements`
Fixes typos in docstrings

### Testing
```
from PIL import Image, ImageDraw

from unstructured.partition.image import partition_image

# Manual check: outline the first three OCR-detected elements on the source image
# and save the annotated copy for visual inspection.
png_filename = "example-docs/english-and-korean.png"
output = "example-docs/english-and-korean-box.png"

png_elements = partition_image(filename=png_filename, strategy="ocr_only")

# The context manager closes the image even if drawing or saving raises.
with Image.open(png_filename) as png_image:
    draw = ImageDraw.Draw(png_image)
    for element in png_elements[:3]:
        draw.polygon(element.metadata.coordinates.points, outline="red", width=2)
    png_image.save(output)
```
  • Loading branch information
Coniferish authored Aug 25, 2023
1 parent c578b85 commit 5872fa2
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

### Features

* Extract element coordinates from PDFs and images when using the `ocr_only` strategy and add them to the element metadata

### Fixes

* Update `partition_html` to respect the order of `<pre>` tags.
Expand Down
19 changes: 16 additions & 3 deletions test_unstructured/partition/pdf-image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pytesseract import TesseractError
from unstructured_inference.inference import layout

from unstructured.documents.elements import Title
from unstructured.partition import image, pdf

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -194,7 +193,7 @@ def test_partition_image_with_ocr_detects_korean():
strategy="ocr_only",
)

assert elements[0] == Title("RULES AND INSTRUCTIONS")
assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")


Expand All @@ -207,7 +206,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
strategy="ocr_only",
)

assert elements[0] == Title("RULES AND INSTRUCTIONS")
assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")


Expand Down Expand Up @@ -378,3 +377,17 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
)

assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_image_with_ocr_has_coordinates_from_file(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """The first element produced by the "ocr_only" strategy carries the expected bounding box."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    elements = image.partition_image(filename=filename, strategy="ocr_only")

    # Truncate to ints so sub-pixel differences do not affect the comparison.
    first_points = elements[0].metadata.coordinates.points
    int_coordinates = []
    for x, y in first_points:
        int_coordinates.append((int(x), int(y)))

    expected = [(14, 36), (14, 16), (381, 16), (381, 36)]
    assert int_coordinates == expected
28 changes: 28 additions & 0 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,31 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
)

assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_pdf_with_ocr_has_coordinates_from_filename(
    filename="example-docs/chevron-page.pdf",
):
    """Partitioning a PDF by filename with "ocr_only" attaches coordinates to the first element."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    elements = pdf.partition_pdf(filename=filename, strategy="ocr_only")
    first_element = elements[0]

    assert first_element.metadata.coordinates.points == expected_points


def test_partition_pdf_with_ocr_has_coordinates_from_file(
    filename="example-docs/chevron-page.pdf",
):
    """Partitioning a PDF from an open file with "ocr_only" attaches coordinates to the
    first element."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    with open(filename, "rb") as pdf_file:
        elements = pdf.partition_pdf(file=pdf_file, strategy="ocr_only")

    first_element = elements[0]
    assert first_element.metadata.coordinates.points == expected_points
8 changes: 4 additions & 4 deletions unstructured/documents/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ def __init__(self):


class PixelSpace(CoordinateSystem):
    """Coordinate system representing a pixel space, such as an image. The origin is at the top
    left."""

    orientation = Orientation.SCREEN


class PointSpace(CoordinateSystem):
    """Coordinate system representing a point space, such as a pdf. The origin is at the bottom
    left."""

    orientation = Orientation.CARTESIAN

Expand Down
73 changes: 71 additions & 2 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pdfminer.utils import open_filename

from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.coordinates import PixelSpace, PointSpace
from unstructured.documents.elements import (
CoordinatesMetadata,
Element,
Expand Down Expand Up @@ -460,6 +460,66 @@ def convert_pdf_to_images(
yield image


def add_pytesseract_bbox_to_elements(elements, bboxes, width, height):
    """
    Get the bounding box of each element and add it to element.metadata.coordinates.

    Each line of `bboxes` is expected to describe one recognized character as
    ``<char> <x1> <y1> <x2> <y2> <page>``. Elements consume box lines in order, one line
    per non-whitespace character of their text. The union of an element's character boxes
    is converted from a PointSpace to a PixelSpace of the same size and stored on the
    element. Elements for which no box lines remain are left without coordinates.

    Args:
        elements: elements containing text detected by pytesseract.image_to_string.
        bboxes (str): The return value of pytesseract.image_to_boxes.
        width: Width of the OCR'd image, used to build both coordinate systems.
        height: Height of the OCR'd image, used to build both coordinate systems.

    Returns:
        The same `elements` object; coordinates are set on the elements in place.
    """
    # (NOTE) jennings: This function was written with pytesseract in mind, but
    # paddle returns similar values via `ocr.ocr(img)`.
    # See more at issue #1176: https://github.com/Unstructured-IO/unstructured/issues/1176
    point_space = PointSpace(
        width=width,
        height=height,
    )
    pixel_space = PixelSpace(
        width=width,
        height=height,
    )

    # Drop blank lines so that an empty OCR result does not leave an unparsable "" entry.
    boxes = [line for line in bboxes.splitlines() if line.strip()]
    start = 0
    for element in elements:
        # Box lines exist only for visible characters, so ignore every kind of whitespace
        # (not just spaces) when deciding how many lines belong to this element.
        char_count = len("".join(element.text.split()))
        element_boxes = boxes[start : start + char_count]  # noqa
        start += char_count

        if not element_boxes:
            # Nothing to measure; avoid writing infinite coordinates.
            continue

        min_x = float("inf")
        min_y = float("inf")
        max_x = 0
        max_y = 0
        for box in element_boxes:
            _, x1, y1, x2, y2, _ = box.split()
            x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

            min_x = min(min_x, x1)
            min_y = min(min_y, y1)
            max_x = max(max_x, x2)
            max_y = max(max_y, y2)

        corners = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
        converted_points = [
            point_space.convert_coordinates_to_new_system(pixel_space, x, y)
            for x, y in corners
        ]

        element.metadata.coordinates = CoordinatesMetadata(
            points=converted_points,
            system=pixel_space,
        )

    return elements


@requires_dependencies("pytesseract")
def _partition_pdf_or_image_with_ocr(
filename: str = "",
Expand All @@ -471,22 +531,27 @@ def _partition_pdf_or_image_with_ocr(
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None,
):
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
"""Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
import pytesseract

if is_image:
if file is not None:
image = PIL.Image.open(file)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
else:
image = PIL.Image.open(filename)
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(filename, config=f"-l '{ocr_languages}'")
elements = partition_text(
text=text,
max_partition=max_partition,
min_partition=min_partition,
metadata_last_modified=metadata_last_modified,
)
width, height = image.size
add_pytesseract_bbox_to_elements(elements, bboxes, width, height)

else:
elements = []
Expand All @@ -499,6 +564,8 @@ def _partition_pdf_or_image_with_ocr(
last_modified=metadata_last_modified,
)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
width, height = image.size

_elements = partition_text(
text=text,
Expand All @@ -509,6 +576,8 @@ def _partition_pdf_or_image_with_ocr(
element.metadata = metadata
elements.append(element)

add_pytesseract_bbox_to_elements(elements, bboxes, width, height)

if include_page_breaks:
elements.append(PageBreak(text=""))
return elements

0 comments on commit 5872fa2

Please sign in to comment.