Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Klaijan/fix: isalnum reference before assignment #1586

Merged
merged 11 commits into from
Oct 3, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

### Fixes

* **Fixes partition_pdf is_alnum reference bug** Problem: When `partition_pdf` attempted to get bounding boxes from an element, it raised a reference-before-assignment error if the first object was not text-extractable. Fix: Switched to a flag that is set when the condition is met. Importance: Crucial to be able to partition PDFs.

## 0.10.18

Expand Down
131 changes: 128 additions & 3 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,23 @@ def get_uris(
coordinate_system: Union[PixelSpace, PointSpace],
page_number: int,
) -> List[dict]:
"""
Extracts URI annotations from a single or a list of PDF object references on a specific page.
The type of annots (list or not) depends on the PDF formatting. The function detects the type
of annots and then passes it on to the get_uris_from_annots function as a List.

Args:
annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references
representing annotations on the page.
height (float): The height of the page in the specified coordinate system.
coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
the annotations' coordinates.
page_number (int): The page number from which to extract annotations.

Returns:
List[dict]: A list of dictionaries, each containing information about a URI annotation,
including its coordinates, bounding box, type, URI link, and page number.
"""
if isinstance(annots, List):
return get_uris_from_annots(annots, height, coordinate_system, page_number)
return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number)
Expand All @@ -879,6 +896,21 @@ def get_uris_from_annots(
coordinate_system: Union[PixelSpace, PointSpace],
page_number: int,
) -> List[dict]:
"""
Extracts URI annotations from a list of PDF object references.

Args:
annots (List[PDFObjRef]): A list of PDF object references representing annotations on
a page.
height (Union[int, float]): The height of the page in the specified coordinate system.
coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
the annotations' coordinates.
page_number (int): The page number from which to extract annotations.

Returns:
List[dict]: A list of dictionaries, each containing information about a URI annotation,
including its coordinates, bounding box, type, URI link, and page number.
"""
annotation_list = []
for annotation in annots:
annotation_dict = try_resolve(annotation)
Expand Down Expand Up @@ -916,6 +948,10 @@ def get_uris_from_annots(


def try_resolve(annot: PDFObjRef):
"""
Attempt to resolve a PDF object reference. If successful, returns the resolved object;
otherwise, returns the original reference.
"""
try:
return annot.resolve()
except Exception:
Expand All @@ -926,6 +962,19 @@ def rect_to_bbox(
rect: Tuple[float, float, float, float],
height: float,
) -> Tuple[float, float, float, float]:
"""
Converts PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
coordinate system where the vertical axis is measured from the top of the page.

Args:
rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
coordinates (x1, y1, x2, y2).
height (float): The height of the page in the specified coordinate system.

Returns:
Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
(x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
"""
x1, y2, x2, y1 = rect
y1 = height - y1
y2 = height - y2
Expand All @@ -936,6 +985,19 @@ def calculate_intersection_area(
bbox1: Tuple[float, float, float, float],
bbox2: Tuple[float, float, float, float],
) -> float:
"""
Calculate the area of intersection between two bounding boxes.

Args:
bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box
in the format (x1, y1, x2, y2).
bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box
in the format (x1, y1, x2, y2).

Returns:
float: The area of intersection between the two bounding boxes. If there is no
intersection, the function returns 0.0.
"""
x1_1, y1_1, x2_1, y2_1 = bbox1
x1_2, y1_2, x2_2, y2_2 = bbox2

Expand All @@ -954,6 +1016,16 @@ def calculate_intersection_area(


def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:
    """
    Compute the area enclosed by a bounding box.

    Args:
        bbox (Tuple[float, float, float, float]): The bounding box coordinates
            in the format (x1, y1, x2, y2).

    Returns:
        float: The box area, i.e. its width (x2 - x1) multiplied by its
            height (y2 - y1). No ordering of the coordinates is enforced, so
            the result is negative when exactly one extent is negative.
    """
    x_start, y_start, x_end, y_end = bbox
    width = x_end - x_start
    height = y_end - y_start
    return width * height
Expand All @@ -965,6 +1037,24 @@ def check_annotations_within_element(
page_number: int,
threshold: float = 0.9,
) -> List[dict]:
"""
Filter annotations that are within or highly overlap with a specified element on a page.

Args:
annotation_list (List[dict]): A list of dictionaries, each containing information
about an annotation.
element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
specified element in the bbox format (x1, y1, x2, y2).
page_number (int): The page number to which the annotations and element belong.
threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines
the minimum overlap required for an annotation to be considered within the element.
Default is 0.9.

Returns:
List[dict]: A list of dictionaries containing information about annotations that are
within or highly overlap with the specified element on the given page, based on the
specified threshold.
"""
annotations_within_element = []
for annotation in annotation_list:
if annotation["page_number"] == page_number and (
Expand All @@ -980,6 +1070,19 @@ def get_word_bounding_box_from_element(
obj: LTTextBox,
height: float,
) -> Tuple[List[LTChar], List[dict]]:
"""
Extracts characters and word bounding boxes from a PDF text element.

Args:
obj (LTTextBox): The PDF text element from which to extract characters and words.
height (float): The height of the page in the specified coordinate system.

Returns:
Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
- List[LTChar]: A list of LTChar objects representing individual characters.
- List[dict]: A list of dictionaries, each containing information about a word,
including its text, bounding box, and start index in the element's text.
"""
characters = []
words = []
text_len = 0
Expand All @@ -1002,10 +1105,9 @@ def get_word_bounding_box_from_element(

# TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
# will need to switch to some pattern matching once we support more languages
if index == 0:
if not word:
isalnum = char.isalnum()

if char.isalnum() != isalnum:
if word and char.isalnum() != isalnum:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if i'm following this logic correctly, can this be simplified to:

isalnum = char.isalnum()
if word and isalnum == False:
    ....

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This condition uses the isalnum flag to split the text into words wherever the characters switch between alphanumeric and non-alphanumeric. For example, Lorem ipsum dolor sit amet, consectetur "adipiscing" elit. would be split as

Lorem 
ipsum 
dolor 
sit 
amet
, 
consectetur
"
adipiscing
" 
elit
.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isalnum = char.isalnum()
words.append(
{"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
Expand All @@ -1028,6 +1130,19 @@ def get_word_bounding_box_from_element(


def map_bbox_and_index(words: List[dict], annot: dict):
"""
Maps a bounding box annotation to the corresponding text and start index within a list of words.

Args:
words (List[dict]): A list of dictionaries, each containing information about a word,
including its text, bounding box, and start index.
annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
"start_index" fields.

Returns:
dict: The updated annotation dictionary with "text" representing the mapped text and
"start_index" representing the start index of the mapped text in the list of words.
"""
if len(words) == 0:
annot["text"] = ""
annot["start_index"] = -1
Expand Down Expand Up @@ -1059,6 +1174,16 @@ def map_bbox_and_index(words: List[dict], annot: dict):


def try_argmin(array: np.ndarray) -> int:
"""
Attempt to find the index of the minimum value in a NumPy array.

Args:
array (np.ndarray): The NumPy array in which to find the minimum value's index.

Returns:
int: The index of the minimum value in the array. If the array is empty or an
IndexError occurs, it returns -1.
"""
try:
return int(np.argmin(array))
except IndexError:
Expand Down
Loading