Skip to content

Commit

Permalink
Fix/521 pdf2image memory error hi res (#948)
Browse files Browse the repository at this point in the history
This PR is to reflect changes in the unstructured-inference PR #152

* Update functionality to retrieve image metadata from a page for document_to_element_list
  • Loading branch information
christinestraub authored Jul 24, 2023
1 parent 6e852cb commit f7def03
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,5 @@ tags

# Ruff cache
.ruff_cache/

unstructured-inference/
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.8.2-dev4
## 0.8.2-dev5

### Enhancements

* Update functionality to retrieve image metadata from a page for `document_to_element_list`
* Links are now tracked in `partition_html` output.
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
Expand All @@ -17,6 +18,7 @@

### Fixes

* Use the `image_metadata` property of the `PageLayout` instance to get the page image info in the `document_to_element_list`
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
* Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
Expand Down
7 changes: 7 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
FileType,
_get_page_image_metadata,
_is_code_mime_type,
_is_text_file_a_csv,
_is_text_file_a_json,
Expand Down Expand Up @@ -469,3 +470,9 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
layout_elem_absent_coordinates = MockDocumentLayout()
elements = document_to_element_list(layout_elem_absent_coordinates)
assert elements[0].metadata.coordinates is None


def test_get_page_image_metadata_and_coordinate_system():
    """Check that `_get_page_image_metadata` returns a dict with the expected keys."""
    doc = MockDocumentLayout()
    metadata = _get_page_image_metadata(doc.pages[0])
    # isinstance instead of an exact type comparison, so dict subclasses are accepted too.
    assert isinstance(metadata, dict)
    # The helper always returns these three keys, even when their values are None.
    assert set(metadata) == {"format", "width", "height"}
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.2-dev4" # pragma: no cover
__version__ = "0.8.2-dev5" # pragma: no cover
44 changes: 39 additions & 5 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
)

if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout
from unstructured_inference.inference.layout import DocumentLayout, PageLayout

try:
import magic
Expand Down Expand Up @@ -466,14 +466,20 @@ def document_to_element_list(
num_pages = len(document.pages)
for i, page in enumerate(document.pages):
page_elements: List[Element] = []

page_image_metadata = _get_page_image_metadata(page)
image_format = page_image_metadata.get("format")
image_width = page_image_metadata.get("width")
image_height = page_image_metadata.get("height")

for layout_element in page.elements:
if hasattr(page, "image") and hasattr(layout_element, "coordinates"):
image_format = page.image.format
coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
if image_width and image_height and hasattr(layout_element, "coordinates"):
coordinate_system = PixelSpace(width=image_width, height=image_height)
else:
image_format = None
coordinate_system = None

element = normalize_layout_element(layout_element, coordinate_system=coordinate_system)

if isinstance(element, List):
for el in element:
el.metadata.page_number = i + 1
Expand Down Expand Up @@ -514,6 +520,34 @@ def document_to_element_list(
return elements


def _get_page_image_metadata(
    page: "PageLayout",
) -> dict:
    """Retrieve the image format, width and height for a page.

    The values are read from ``page.image`` when that attribute is set. Otherwise they
    are read from the ``page.image_metadata`` mapping. If neither is available, every
    value is ``None``.

    Args:
        page: The page layout to inspect. Both ``image`` and ``image_metadata`` are
            looked up with ``getattr`` and may be missing or ``None``.

    Returns:
        A dict with the keys ``"format"``, ``"width"`` and ``"height"``.
    """
    # NOTE: the annotation is quoted because PageLayout is only imported under
    # TYPE_CHECKING and therefore does not exist as a name at runtime.
    image = getattr(page, "image", None)

    # Compare against None explicitly: a present image object must be used even if it
    # happens to evaluate as falsy (e.g. it defines __len__ or __bool__).
    if image is not None:
        return {
            "format": image.format,
            "width": image.width,
            "height": image.height,
        }

    # A missing, None or empty metadata mapping yields None for every key.
    image_metadata = getattr(page, "image_metadata", None) or {}
    return {
        "format": image_metadata.get("format"),
        "width": image_metadata.get("width"),
        "height": image_metadata.get("height"),
    }


PROGRAMMING_LANGUAGES = [
"javascript",
"python",
Expand Down

0 comments on commit f7def03

Please sign in to comment.