fix: isalnum referenced before assignment (#1586)

**Executive Summary** Fix a bug in the `get_word_bounding_box_from_element` function that prevented `partition_pdf` from running. **Technical Details** - The function originally assigned `isalnum` only at the first index, so it could be referenced before assignment. It is now assigned whenever the current word is empty.
Unstructured-IO · Oct 3, 2023 · d6efd52 · d6efd52
1 parent b2e9976
commit d6efd52
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@
 * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
 
+### Features 
+
+### Fixes
+
+* **Fixes partition_pdf is_alnum reference bug** Problem: When `partition_pdf` attempted to get the bounding box from an element, it hit a reference-before-assignment error if the first object was not text extractable.  Fix: Switched to a flag that is set when the condition is met. Importance: Crucial to be able to partition PDFs.
+
 ## 0.10.17-dev3
 
 ### Enhancements
@@ -20,7 +26,6 @@
   Fix: Updated code to deal with these cases.
   Importance: This will ensure the correctness when partitioning HTML and Markdown documents.
 
-
 ## 0.10.18
 
 ### Enhancements

diff --git a/example-docs/interface-config-guide-p93.pdf b/example-docs/interface-config-guide-p93.pdf
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -907,7 +907,7 @@ def test_combine_numbered_list(filename):
     "filename",
     ["example-docs/layout-parser-paper-fast.pdf"],
 )
-def test_hyperlinks(filename):
+def test_partition_pdf_hyperlinks(filename):
     elements = pdf.partition_pdf(filename=filename, strategy="auto")
     links = [
         {
@@ -933,7 +933,7 @@ def test_hyperlinks(filename):
     "filename",
     ["example-docs/embedded-link.pdf"],
 )
-def test_hyperlinks_multiple_lines(filename):
+def test_partition_pdf_hyperlinks_multiple_lines(filename):
     elements = pdf.partition_pdf(filename=filename, strategy="auto")
     assert elements[-1].metadata.links[-1]["text"] == "capturing"
     assert len(elements[-1].metadata.links) == 2
@@ -953,3 +953,13 @@ def test_partition_pdf_uses_model_name():
         mockpartition.assert_called_once()
         assert "model_name" in mockpartition.call_args.kwargs
         assert mockpartition.call_args.kwargs["model_name"]
+
+
+def test_partition_pdf_word_bbox_not_char(
+    filename="example-docs/interface-config-guide-p93.pdf",
+):
+    try:
+        elements = pdf.partition_pdf(filename=filename)
+    except Exception as e:
+        raise AssertionError("Partitioning fail: %s" % e) from e  # a bare str cannot be raised
+    assert len(elements) == 17
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -868,6 +868,23 @@ def get_uris(
     coordinate_system: Union[PixelSpace, PointSpace],
     page_number: int,
 ) -> List[dict]:
+    """
+    Extracts URI annotations from a single or a list of PDF object references on a specific page.
+    The type of annots (list or not) depends on the pdf formatting. The function detects the type
+    of annots and then passes them on to the get_uris_from_annots function as a List.
+
+    Args:
+        annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references
+            representing annotations on the page.
+        height (float): The height of the page in the specified coordinate system.
+        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        List[dict]: A list of dictionaries, each containing information about a URI annotation,
+        including its coordinates, bounding box, type, URI link, and page number.
+    """
     if isinstance(annots, List):
         return get_uris_from_annots(annots, height, coordinate_system, page_number)
     return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number)
@@ -879,6 +896,21 @@ def get_uris_from_annots(
     coordinate_system: Union[PixelSpace, PointSpace],
     page_number: int,
 ) -> List[dict]:
+    """
+    Extracts URI annotations from a list of PDF object references.
+
+    Args:
+        annots (List[PDFObjRef]): A list of PDF object references representing annotations on
+            a page.
+        height (Union[int, float]): The height of the page in the specified coordinate system.
+        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        List[dict]: A list of dictionaries, each containing information about a URI annotation,
+        including its coordinates, bounding box, type, URI link, and page number.
+    """
     annotation_list = []
     for annotation in annots:
         annotation_dict = try_resolve(annotation)
@@ -916,6 +948,10 @@ def get_uris_from_annots(
 
 
 def try_resolve(annot: PDFObjRef):
+    """
+    Attempt to resolve a PDF object reference. If successful, returns the resolved object;
+    otherwise, returns the original reference.
+    """
     try:
         return annot.resolve()
     except Exception:
@@ -926,6 +962,19 @@ def rect_to_bbox(
     rect: Tuple[float, float, float, float],
     height: float,
 ) -> Tuple[float, float, float, float]:
+    """
+    Converts PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
+    coordinate system where the vertical axis is measured from the top of the page.
+
+    Args:
+        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
+            coordinates (x1, y1, x2, y2).
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
+        (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
+    """
     x1, y2, x2, y1 = rect
     y1 = height - y1
     y2 = height - y2
@@ -936,6 +985,19 @@ def calculate_intersection_area(
     bbox1: Tuple[float, float, float, float],
     bbox2: Tuple[float, float, float, float],
 ) -> float:
+    """
+    Calculate the area of intersection between two bounding boxes.
+
+    Args:
+        bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box
+            in the format (x1, y1, x2, y2).
+        bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of intersection between the two bounding boxes. If there is no
+        intersection, the function returns 0.0.
+    """
     x1_1, y1_1, x2_1, y2_1 = bbox1
     x1_2, y1_2, x2_2, y2_2 = bbox2
 
@@ -954,6 +1016,16 @@ def calculate_intersection_area(
 
 
 def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:
+    """
+    Calculate the area of a bounding box.
+
+    Args:
+        bbox (Tuple[float, float, float, float]): The coordinates of the bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of the bounding box, computed as the product of its width and height.
+    """
     x1, y1, x2, y2 = bbox
     area = (x2 - x1) * (y2 - y1)
     return area
@@ -965,6 +1037,24 @@ def check_annotations_within_element(
     page_number: int,
     threshold: float = 0.9,
 ) -> List[dict]:
+    """
+    Filter annotations that are within or highly overlap with a specified element on a page.
+
+    Args:
+        annotation_list (List[dict]): A list of dictionaries, each containing information
+            about an annotation.
+        element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
+            specified element in the bbox format (x1, y1, x2, y2).
+        page_number (int): The page number to which the annotations and element belong.
+        threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines
+            the minimum overlap required for an annotation to be considered within the element.
+            Default is 0.9.
+
+    Returns:
+        List[dict]: A list of dictionaries containing information about annotations that are
+        within or highly overlap with the specified element on the given page, based on the
+        specified threshold.
+    """
     annotations_within_element = []
     for annotation in annotation_list:
         if annotation["page_number"] == page_number and (
@@ -980,6 +1070,19 @@ def get_word_bounding_box_from_element(
     obj: LTTextBox,
     height: float,
 ) -> Tuple[List[LTChar], List[dict]]:
+    """
+    Extracts characters and word bounding boxes from a PDF text element.
+
+    Args:
+        obj (LTTextBox): The PDF text element from which to extract characters and words.
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
+            - List[LTChar]: A list of LTChar objects representing individual characters.
+            - List[dict]: A list of dictionaries, each containing information about a word,
+              including its text, bounding box, and start index in the element's text.
+    """
     characters = []
     words = []
     text_len = 0
@@ -1002,10 +1105,9 @@ def get_word_bounding_box_from_element(
 
                 # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
                 # will need to switch to some pattern matching once we support more languages
-                if index == 0:
+                if not word:
                     isalnum = char.isalnum()
-
-                if char.isalnum() != isalnum:
+                if word and char.isalnum() != isalnum:
                     isalnum = char.isalnum()
                     words.append(
                         {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
@@ -1028,6 +1130,19 @@ def get_word_bounding_box_from_element(
 
 
 def map_bbox_and_index(words: List[dict], annot: dict):
+    """
+    Maps a bounding box annotation to the corresponding text and start index within a list of words.
+
+    Args:
+        words (List[dict]): A list of dictionaries, each containing information about a word,
+            including its text, bounding box, and start index.
+        annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
+            "start_index" fields.
+
+    Returns:
+        dict: The updated annotation dictionary with "text" representing the mapped text and
+            "start_index" representing the start index of the mapped text in the list of words.
+    """
     if len(words) == 0:
         annot["text"] = ""
         annot["start_index"] = -1
@@ -1059,6 +1174,16 @@ def map_bbox_and_index(words: List[dict], annot: dict):
 
 
 def try_argmin(array: np.ndarray) -> int:
+    """
+    Attempt to find the index of the minimum value in a NumPy array.
+
+    Args:
+        array (np.ndarray): The NumPy array in which to find the minimum value's index.
+
+    Returns:
+        int: The index of the minimum value in the array. If the array is empty or an
+        IndexError occurs, it returns -1.
+    """
     try:
         return int(np.argmin(array))
     except IndexError: