fix pdf partition of list items being detected as titles in OCR only …

…mode (#1119) Closes GitHub issue #1010. Adds the group_bullet_paragraph function to handle grouping of bullet items that are split across multiple lines.
Unstructured-IO · Aug 15, 2023 · 6e5d27c · 6e5d27c
1 parent cb923b9
commit 6e5d27c
Show file tree

Hide file tree

Showing 7 changed files with 76 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+## 0.9.4-dev0
+
+
+### Enhancements
+
+
+### Features
+
+
+### Fixes
+* fix pdf partition of list items being detected as titles in OCR only mode
+
 ## 0.9.3
 
 ### Enhancements
@@ -18,6 +30,7 @@
 
 ### Fixes
 
+* fix pdf partition of list items being detected as titles in OCR only mode
 * make notion module discoverable
 * fix emails with `Content-Distribution: inline` and `Content-Distribution: attachment` with no filename
 * Fix email attachment filenames which had `=` in the filename itself

diff --git a/example-docs/list-item-example.pdf b/example-docs/list-item-example.pdf
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
@@ -209,6 +209,30 @@ def test_group_broken_paragraphs_non_default_settings():
     )
 
 
+def test_group_broken_paragraphs_with_bullets():  # exercises core.group_bullet_paragraph
+    text = """○The big red fox
+is walking down the lane.
+
+○At the end of the lane
+the fox met a friendly bear."""
+    assert core.group_bullet_paragraph(text) == [  # one entry per bullet, wrapped lines joined
+        "○The big red fox is walking down the lane. ",  # trailing space is part of the output
+        "○At the end of the lane the fox met a friendly bear.",
+    ]
+
+
+def test_group_bullet_paragraph_with_e_bullets():  # "e" stands in for a bullet misread by OCR
+    text = """e The big red fox
+is walking down the lane.
+
+e At the end of the lane
+the fox met a friendly bear."""
+    assert core.group_bullet_paragraph(text) == [  # each line-initial "e" becomes a "·" bullet
+        "· The big red fox is walking down the lane. ",
+        "· At the end of the lane the fox met a friendly bear.",
+    ]
+
+
 @pytest.mark.parametrize(
     # NOTE(yuming): Tests combined cleaners
     (

diff --git a/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json b/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json
@@ -604,30 +604,12 @@
   },
   {
     "type": "ListItem",
-    "element_id": "19d8999a73cdba6f00747d214f1ef31b",
+    "element_id": "d5bf23fd7f622ba14b5dd626721d0388",
     "metadata": {
       "data_source": {},
       "filetype": "text/plain"
     },
-    "text": "NOTE:  A \"print position\" may contain several characters"
-  },
-  {
-    "type": "NarrativeText",
-    "element_id": "85231efbe0f659114b09387f8dfd695f",
-    "metadata": {
-      "data_source": {},
-      "filetype": "text/plain"
-    },
-    "text": "which are the result of overstrikes, or of sequences such as"
-  },
-  {
-    "type": "Title",
-    "element_id": "17548c22a23e66db7798c82919b49bc0",
-    "metadata": {
-      "data_source": {},
-      "filetype": "text/plain"
-    },
-    "text": "<char1> BS <char2>..."
+    "text": "NOTE:  A \"print position\" may contain several characters which are the result of overstrikes, or of sequences such as <char1> BS <char2>..."
   },
   {
     "type": "Title",

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.3"  # pragma: no cover
+__version__ = "0.9.4-dev0"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -8,10 +8,12 @@
 )
 from unstructured.nlp.patterns import (
     DOUBLE_PARAGRAPH_PATTERN_RE,
+    E_BULLET_PATTERN,
     LINE_BREAK_RE,
     PARAGRAPH_PATTERN,
     PARAGRAPH_PATTERN_RE,
     UNICODE_BULLETS_RE,
+    UNICODE_BULLETS_RE_0W,
 )
 
 
@@ -66,6 +68,34 @@ def clean_ordered_bullets(text) -> str:
     return text_cl
 
 
+def group_bullet_paragraph(paragraph: str) -> list:
+    """Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
+    For example:
+
+    '''○ The big red fox
+    is walking down the lane.
+
+    ○ At the end of the lane
+    the fox met a friendly bear.'''
+
+    Gets converted to a list with one string per bullet item:
+
+    ['○ The big red fox is walking down the lane. ',
+     '○ At the end of the lane the fox met a friendly bear.']
+    """
+    clean_paragraphs = []
+    # pytesseract converts some bullet points to standalone "e" characters.
+    # Substitute "e" with bullets since they are later used in partition_text
+    # to determine list element type.
+    paragraph = (re.sub(E_BULLET_PATTERN, "·", paragraph)).strip()
+
+    bullet_paras = re.split(UNICODE_BULLETS_RE_0W, paragraph)  # zero-width split keeps bullets
+    for bullet in bullet_paras:
+        if bullet:  # skip empty pieces produced by the split
+            clean_paragraphs.append(re.sub(PARAGRAPH_PATTERN, " ", bullet))
+    return clean_paragraphs
+
+
 def group_broken_paragraphs(
     text: str,
     line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
@@ -97,8 +127,9 @@ def group_broken_paragraphs(
         #     http://www.apache.org/licenses/
         para_split = line_split.split(paragraph)
         all_lines_short = all(len(line.strip().split(" ")) < 5 for line in para_split)
-        if UNICODE_BULLETS_RE.match(paragraph.strip()):
-            clean_paragraphs.extend(re.split(PARAGRAPH_PATTERN, paragraph))
+        # pytesseract converts some bullet points to standalone "e" characters
+        if UNICODE_BULLETS_RE.match(paragraph.strip()) or E_BULLET_PATTERN.match(paragraph.strip()):
+            clean_paragraphs.extend(group_bullet_paragraph(paragraph))
         elif all_lines_short:
             clean_paragraphs.extend([line for line in para_split if line.strip()])
         else:

diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
@@ -59,6 +59,9 @@
 ]
 BULLETS_PATTERN = "|".join(UNICODE_BULLETS)
 UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")
+# zero-width: matches just before a bullet not preceded by another, so .split() keeps bullets
+UNICODE_BULLETS_RE_0W = re.compile(f"(?={BULLETS_PATTERN})(?<!{BULLETS_PATTERN})")
+E_BULLET_PATTERN = re.compile(r"^e(?=\s)", re.MULTILINE)  # line-initial "e" before whitespace
 
 # NOTE(klaijan) - Captures reference of format [1] or [i] or [a] at any point in the line.
 REFERENCE_PATTERN = r"\[(?:[\d]+|[a-z]|[ivxlcdm])\]"