diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5626918d67..d1e27ff6e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@

 ### Features

+* Extract coordinates from PDFs and images when using OCR only strategy and add to metadata
+
 ### Fixes

 * Update `partition_html` to respect the order of `<pre>` tags.
diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
index 522132dddc..00b5cee2db 100644
--- a/test_unstructured/partition/pdf-image/test_image.py
+++ b/test_unstructured/partition/pdf-image/test_image.py
@@ -7,7 +7,6 @@
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout

-from unstructured.documents.elements import Title
 from unstructured.partition import image, pdf

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@@ -194,7 +193,7 @@ def test_partition_image_with_ocr_detects_korean():
         strategy="ocr_only",
     )

-    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[0].text == "RULES AND INSTRUCTIONS"
     assert elements[3].text.replace(" ", "").startswith("안녕하세요")


@@ -207,7 +206,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
             strategy="ocr_only",
         )

-    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[0].text == "RULES AND INSTRUCTIONS"
     assert elements[3].text.replace(" ", "").startswith("안녕하세요")


@@ -378,3 +377,17 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
         )

     assert elements[0].metadata.last_modified == expected_last_modification_date
+
+
+def test_partition_image_with_ocr_has_coordinates_from_file(
+    mocker,
+    filename="example-docs/english-and-korean.png",
+):
+    mocked_last_modification_date = "2029-07-05T09:24:28"
+    mocker.patch(
+        "unstructured.partition.pdf.get_last_modified_date",
+        return_value=mocked_last_modification_date,
+    )
+    elements = image.partition_image(filename=filename, strategy="ocr_only")
+    int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points]
+    assert int_coordinates == [(14, 36), (14, 16), (381, 16), (381, 36)]
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
index 736868af7c..68d9b9fc57 100644
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -768,3 +768,31 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
         )

     assert elements[0].metadata.last_modified == expected_last_modification_date
+
+
+def test_partition_pdf_with_ocr_has_coordinates_from_filename(
+    filename="example-docs/chevron-page.pdf",
+):
+    elements = pdf.partition_pdf(filename=filename, strategy="ocr_only")
+    assert elements[0].metadata.coordinates.points == [
+        (657.0, 2144.0),
+        (657.0, 2106.0),
+        (1043.0, 2106.0),
+        (1043.0, 2144.0),
+    ]
+
+
+def test_partition_pdf_with_ocr_has_coordinates_from_file(
+    filename="example-docs/chevron-page.pdf",
+):
+    with open(filename, "rb") as f:
+        elements = pdf.partition_pdf(
+            file=f,
+            strategy="ocr_only",
+        )
+    assert elements[0].metadata.coordinates.points == [
+        (657.0, 2144.0),
+        (657.0, 2106.0),
+        (1043.0, 2106.0),
+        (1043.0, 2144.0),
+    ]
diff --git a/unstructured/documents/coordinates.py b/unstructured/documents/coordinates.py
index dc64c68462..6d1a489ef7 100644
--- a/unstructured/documents/coordinates.py
+++ b/unstructured/documents/coordinates.py
@@ -80,15 +80,15 @@ def __init__(self):


 class PixelSpace(CoordinateSystem):
-    """Coordinate system representing a pixel space, such as an image. The origin is at the bottom
-    right."""
+    """Coordinate system representing a pixel space, such as an image. The origin is at the top
+    left."""

     orientation = Orientation.SCREEN


 class PointSpace(CoordinateSystem):
-    """Coordinate system representing a point space, such as a pdf. The origin is at the top
-    right."""
+    """Coordinate system representing a point space, such as a pdf. The origin is at the bottom
+    left."""

     orientation = Orientation.CARTESIAN

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index a4667c9c3d..91f63fd86e 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -11,7 +11,7 @@
 from pdfminer.utils import open_filename

 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.coordinates import PixelSpace
+from unstructured.documents.coordinates import PixelSpace, PointSpace
 from unstructured.documents.elements import (
     CoordinatesMetadata,
     Element,
@@ -460,6 +460,66 @@ def convert_pdf_to_images(
         yield image


+def add_pytesseract_bbox_to_elements(elements, bboxes, width, height):
+    """
+    Get the bounding box of each element and add it to element.metadata.coordinates
+
+    Each element claims, in order, one line of `bboxes` per non-whitespace character of
+    its text; the union of those character boxes is the element's bounding box. Boxes
+    are read in a bottom-left-origin system and converted to a top-left-origin
+    `PixelSpace` of the same size before being stored.
+
+    Args:
+        elements: elements containing text detected by pytesseract.image_to_string
+            on the same image that produced `bboxes`.
+        bboxes (str): The return value of pytesseract.image_to_boxes.
+        width: width of that image, in pixels.
+        height: height of that image, in pixels.
+
+    Returns:
+        The same `elements`, updated in place. An element with no matching boxes
+        (for example one whose text is empty) is left without coordinates.
+    """
+    # (NOTE) jennings: This function was written with pytesseract in mind, but
+    # paddle returns similar values via `ocr.ocr(img)`.
+    # See more at issue #1176: https://github.com/Unstructured-IO/unstructured/issues/1176
+    source_space = PointSpace(width=width, height=height)
+    pixel_space = PixelSpace(width=width, height=height)
+
+    # Skip blank lines so that empty OCR output yields no boxes at all.
+    boxes = [line for line in bboxes.splitlines() if line.strip()]
+
+    start = 0
+    for element in elements:
+        # One box per non-whitespace character: drop all whitespace, not only spaces.
+        char_count = len("".join(element.text.split()))
+        element_boxes = boxes[start : start + char_count]  # noqa
+        start += char_count
+        if not element_boxes:
+            # Nothing to measure; the min/max of no boxes is not a real bounding box.
+            continue
+
+        coords = []
+        for box in element_boxes:
+            _, x1, y1, x2, y2, _ = box.split()
+            coords.append((int(x1), int(y1), int(x2), int(y2)))
+        x1s, y1s, x2s, y2s = zip(*coords)
+        min_x, min_y, max_x, max_y = min(x1s), min(y1s), max(x2s), max(y2s)
+
+        points = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
+        # NOTE(review): elements sharing one metadata object will all report the box
+        # written last -- confirm callers give each element its own metadata.
+        element.metadata.coordinates = CoordinatesMetadata(
+            points=[
+                tuple(source_space.convert_coordinates_to_new_system(pixel_space, x, y))
+                for x, y in points
+            ],
+            system=pixel_space,
+        )
+
+    return elements
+
+
 @requires_dependencies("pytesseract")
 def _partition_pdf_or_image_with_ocr(
     filename: str = "",
@@ -471,7 +531,7 @@ def _partition_pdf_or_image_with_ocr(
     min_partition: Optional[int] = 0,
     metadata_last_modified: Optional[str] = None,
 ):
-    """Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
+    """Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
     to an image prior to processing."""
     import pytesseract

@@ -479,14 +539,20 @@ def _partition_pdf_or_image_with_ocr(
         if file is not None:
             image = PIL.Image.open(file)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
+            width, height = image.size
         else:
+            with PIL.Image.open(filename) as image:
+                width, height = image.size
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(filename, config=f"-l '{ocr_languages}'")
         elements = partition_text(
             text=text,
             max_partition=max_partition,
             min_partition=min_partition,
             metadata_last_modified=metadata_last_modified,
         )
+        add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
     else:
         elements = []

@@ -499,6 +565,8 @@ def _partition_pdf_or_image_with_ocr(
                 last_modified=metadata_last_modified,
             )
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
+            width, height = image.size

             _elements = partition_text(
                 text=text,
@@ -509,6 +577,8 @@ def _partition_pdf_or_image_with_ocr(
                 element.metadata = metadata
                 elements.append(element)

+            add_pytesseract_bbox_to_elements(_elements, bboxes, width, height)
+
             if include_page_breaks:
                 elements.append(PageBreak(text=""))
     return elements