From f7def03d55c4f46e327d1751a3e1451a2fc5d2cb Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Mon, 24 Jul 2023 12:22:56 -0700 Subject: [PATCH] Fix/521 pdf2image memory error hi res (#948) This PR is to reflect changes in the unstructured-inference PR #152 * Update functionality to retrieve image metadata from a page for document_to_element_list --- .gitignore | 2 + CHANGELOG.md | 4 +- test_unstructured/file_utils/test_filetype.py | 7 +++ unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 44 ++++++++++++++++--- 5 files changed, 52 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 0effce2069..873213e8c9 100644 --- a/.gitignore +++ b/.gitignore @@ -186,3 +186,5 @@ tags # Ruff cache .ruff_cache/ + +unstructured-inference/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 5092058885..b3c2d28752 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.8.2-dev4 +## 0.8.2-dev5 ### Enhancements +* Update functionality to retrieve image metadata from a page for `document_to_element_list` * Links are now tracked in `partition_html` output. * Set the file's current position to the beginning after reading the file in `convert_to_bytes` * Add min_partition kwarg to that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split. 
@@ -17,6 +18,7 @@ ### Fixes +* Use the `image_metadata` property of the `PageLayout` instance to get the page image info in the `document_to_element_list` * Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy * Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy * Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index e8dc75a3a9..105bcfe029 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -11,6 +11,7 @@ from unstructured.file_utils import filetype from unstructured.file_utils.filetype import ( FileType, + _get_page_image_metadata, _is_code_mime_type, _is_text_file_a_csv, _is_text_file_a_json, @@ -469,3 +470,9 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): layout_elem_absent_coordinates = MockDocumentLayout() elements = document_to_element_list(layout_elem_absent_coordinates) assert elements[0].metadata.coordinates is None + + +def test_get_page_image_metadata_and_coordinate_system(): + doc = MockDocumentLayout() + metadata = _get_page_image_metadata(doc.pages[0]) + assert isinstance(metadata, dict) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1866fb8806..4ba261c4ad 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.2-dev4" # pragma: no cover +__version__ = "0.8.2-dev5" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 56579beb40..80cf56cc65 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -20,7 +20,7 @@ ) if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout + from unstructured_inference.inference.layout
import DocumentLayout, PageLayout try: import magic @@ -466,14 +466,20 @@ def document_to_element_list( num_pages = len(document.pages) for i, page in enumerate(document.pages): page_elements: List[Element] = [] + + page_image_metadata = _get_page_image_metadata(page) + image_format = page_image_metadata.get("format") + image_width = page_image_metadata.get("width") + image_height = page_image_metadata.get("height") + for layout_element in page.elements: - if hasattr(page, "image") and hasattr(layout_element, "coordinates"): - image_format = page.image.format - coordinate_system = PixelSpace(width=page.image.width, height=page.image.height) + if image_width and image_height and hasattr(layout_element, "coordinates"): + coordinate_system = PixelSpace(width=image_width, height=image_height) else: - image_format = None coordinate_system = None + element = normalize_layout_element(layout_element, coordinate_system=coordinate_system) + if isinstance(element, List): for el in element: el.metadata.page_number = i + 1 @@ -514,6 +520,34 @@ def document_to_element_list( return elements +def _get_page_image_metadata( + page: PageLayout, +) -> dict: + """Return the format, width, and height of a page's image (None when unavailable).""" + + image = getattr(page, "image", None) + image_metadata = getattr(page, "image_metadata", None) + + if image: + image_format = image.format + image_width = image.width + image_height = image.height + elif image_metadata: + image_format = image_metadata.get("format") + image_width = image_metadata.get("width") + image_height = image_metadata.get("height") + else: + image_format = None + image_width = None + image_height = None + + return { + "format": image_format, + "width": image_width, + "height": image_height, + } + + PROGRAMMING_LANGUAGES = [ "javascript", "python",