From 6fa0c09d62e1a5f93fe452726ee0ed3b5c11619b Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 30 Oct 2024 00:53:25 -0700 Subject: [PATCH] feat: enhance word extraction from PDFMiner objects --- unstructured/partition/pdf_image/pdfminer_processing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 9821379092..91a3e689f2 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -575,6 +575,11 @@ def get_words_from_obj( y2 = height - character.y0 word += char + else: + words.append( + {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, + ) + word = "" text_len += len(text_line) return characters, words