diff --git a/CHANGELOG.md b/CHANGELOG.md index 7324722691..a37fea2809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. +### Features + +### Fixes + +* **Fixes partition_pdf is_alnum reference bug** Problem: `partition_pdf` raised a reference-before-assignment error while getting word bounding boxes from an element when the first object is not text extractable. Fix: Initialize the `isalnum` flag whenever the current word is empty rather than only at the first index. Importance: Crucial to be able to partition PDFs. + ## 0.10.17-dev3 ### Enhancements @@ -20,7 +26,6 @@ Fix: Updated code to deal with these cases. Importance: This will ensure the correctness when partitioning HTML and Markdown documents. 
- ## 0.10.18 ### Enhancements diff --git a/example-docs/interface-config-guide-p93.pdf b/example-docs/interface-config-guide-p93.pdf new file mode 100644 index 0000000000..db41a7cae4 Binary files /dev/null and b/example-docs/interface-config-guide-p93.pdf differ diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 6cf4e93894..e14a793a2a 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -907,7 +907,7 @@ def test_combine_numbered_list(filename): "filename", ["example-docs/layout-parser-paper-fast.pdf"], ) -def test_hyperlinks(filename): +def test_partition_pdf_hyperlinks(filename): elements = pdf.partition_pdf(filename=filename, strategy="auto") links = [ { @@ -933,7 +933,7 @@ def test_hyperlinks(filename): "filename", ["example-docs/embedded-link.pdf"], ) -def test_hyperlinks_multiple_lines(filename): +def test_partition_pdf_hyperlinks_multiple_lines(filename): elements = pdf.partition_pdf(filename=filename, strategy="auto") assert elements[-1].metadata.links[-1]["text"] == "capturing" assert len(elements[-1].metadata.links) == 2 @@ -953,3 +953,13 @@ def test_partition_pdf_uses_model_name(): mockpartition.assert_called_once() assert "model_name" in mockpartition.call_args.kwargs assert mockpartition.call_args.kwargs["model_name"] + + +def test_partition_pdf_word_bbox_not_char( + filename="example-docs/interface-config-guide-p93.pdf", +): + try: + elements = pdf.partition_pdf(filename=filename) + except Exception as e: + raise AssertionError("Partitioning fail: %s" % e) from e + assert len(elements) == 17 diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 4cfa6b044a..5c1c3cbfd4 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -868,6 +868,23 @@ def get_uris( coordinate_system: Union[PixelSpace, PointSpace], page_number: int, ) -> List[dict]: + """ + Extracts URI annotations from a single 
or a list of PDF object references on a specific page. + The type of annots (list or not) depends on the pdf formatting. The function detects the type + of annots and then passes them on to the get_uris_from_annots function as a List. + + Args: + annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references + representing annotations on the page. + height (float): The height of the page in the specified coordinate system. + coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent + the annotations' coordinates. + page_number (int): The page number from which to extract annotations. + + Returns: + List[dict]: A list of dictionaries, each containing information about a URI annotation, + including its coordinates, bounding box, type, URI link, and page number. + """ if isinstance(annots, List): return get_uris_from_annots(annots, height, coordinate_system, page_number) return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number) @@ -879,6 +896,21 @@ def get_uris_from_annots( coordinate_system: Union[PixelSpace, PointSpace], page_number: int, ) -> List[dict]: + """ + Extracts URI annotations from a list of PDF object references. + + Args: + annots (List[PDFObjRef]): A list of PDF object references representing annotations on + a page. + height (Union[int, float]): The height of the page in the specified coordinate system. + coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent + the annotations' coordinates. + page_number (int): The page number from which to extract annotations. + + Returns: + List[dict]: A list of dictionaries, each containing information about a URI annotation, + including its coordinates, bounding box, type, URI link, and page number. 
+ """ annotation_list = [] for annotation in annots: annotation_dict = try_resolve(annotation) @@ -916,6 +948,10 @@ def get_uris_from_annots( def try_resolve(annot: PDFObjRef): + """ + Attempt to resolve a PDF object reference. If successful, returns the resolved object; + otherwise, returns the original reference. + """ try: return annot.resolve() except Exception: @@ -926,6 +962,19 @@ def rect_to_bbox( rect: Tuple[float, float, float, float], height: float, ) -> Tuple[float, float, float, float]: + """ + Converts PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified + coordinate system where the vertical axis is measured from the top of the page. + + Args: + rect (Tuple[float, float, float, float]): A tuple representing PDF rectangle + coordinates (x1, y1, x2, y2). + height (float): The height of the page in the specified coordinate system. + + Returns: + Tuple[float, float, float, float]: A tuple representing the bounding box coordinates + (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page. + """ x1, y2, x2, y1 = rect y1 = height - y1 y2 = height - y2 @@ -936,6 +985,19 @@ def calculate_intersection_area( bbox1: Tuple[float, float, float, float], bbox2: Tuple[float, float, float, float], ) -> float: + """ + Calculate the area of intersection between two bounding boxes. + + Args: + bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box + in the format (x1, y1, x2, y2). + bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box + in the format (x1, y1, x2, y2). + + Returns: + float: The area of intersection between the two bounding boxes. If there is no + intersection, the function returns 0.0. + """ x1_1, y1_1, x2_1, y2_1 = bbox1 x1_2, y1_2, x2_2, y2_2 = bbox2 @@ -954,6 +1016,16 @@ def calculate_intersection_area( def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float: + """ + Calculate the area of a bounding box. 
+ + Args: + bbox (Tuple[float, float, float, float]): The coordinates of the bounding box + in the format (x1, y1, x2, y2). + + Returns: + float: The area of the bounding box, computed as the product of its width and height. + """ x1, y1, x2, y2 = bbox area = (x2 - x1) * (y2 - y1) return area @@ -965,6 +1037,24 @@ def check_annotations_within_element( page_number: int, threshold: float = 0.9, ) -> List[dict]: + """ + Filter annotations that are within or highly overlap with a specified element on a page. + + Args: + annotation_list (List[dict]): A list of dictionaries, each containing information + about an annotation. + element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the + specified element in the bbox format (x1, y1, x2, y2). + page_number (int): The page number to which the annotations and element belong. + threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines + the minimum overlap required for an annotation to be considered within the element. + Default is 0.9. + + Returns: + List[dict]: A list of dictionaries containing information about annotations that are + within or highly overlap with the specified element on the given page, based on the + specified threshold. + """ annotations_within_element = [] for annotation in annotation_list: if annotation["page_number"] == page_number and ( @@ -980,6 +1070,19 @@ def get_word_bounding_box_from_element( obj: LTTextBox, height: float, ) -> Tuple[List[LTChar], List[dict]]: + """ + Extracts characters and word bounding boxes from a PDF text element. + + Args: + obj (LTTextBox): The PDF text element from which to extract characters and words. + height (float): The height of the page in the specified coordinate system. + + Returns: + Tuple[List[LTChar], List[dict]]: A tuple containing two lists: + - List[LTChar]: A list of LTChar objects representing individual characters. 
+ - List[dict]: A list of dictionaries, each containing information about a word, + including its text, bounding box, and start index in the element's text. + """ characters = [] words = [] text_len = 0 @@ -1002,10 +1105,9 @@ def get_word_bounding_box_from_element( # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9 # will need to switch to some pattern matching once we support more languages - if index == 0: + if not word: isalnum = char.isalnum() - - if char.isalnum() != isalnum: + if word and char.isalnum() != isalnum: isalnum = char.isalnum() words.append( {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, @@ -1028,6 +1130,19 @@ def get_word_bounding_box_from_element( def map_bbox_and_index(words: List[dict], annot: dict): + """ + Maps a bounding box annotation to the corresponding text and start index within a list of words. + + Args: + words (List[dict]): A list of dictionaries, each containing information about a word, + including its text, bounding box, and start index. + annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and + "start_index" fields. + + Returns: + dict: The updated annotation dictionary with "text" representing the mapped text and + "start_index" representing the start index of the mapped text in the list of words. + """ if len(words) == 0: annot["text"] = "" annot["start_index"] = -1 @@ -1059,6 +1174,16 @@ def map_bbox_and_index(words: List[dict], annot: dict): def try_argmin(array: np.ndarray) -> int: + """ + Attempt to find the index of the minimum value in a NumPy array. + + Args: + array (np.ndarray): The NumPy array in which to find the minimum value's index. + + Returns: + int: The index of the minimum value in the array. If the array is empty or an + IndexError occurs, it returns -1. + """ try: return int(np.argmin(array)) except IndexError: