Skip to content

Commit

Permalink
fix pdf partition of list items being detected as titles in OCR only …
Browse files Browse the repository at this point in the history
…mode (#1119)

Closes Github issue #1010

adds group_bullet_paragraph func to handle grouping of bullet items that are split across multiple lines
  • Loading branch information
Coniferish authored Aug 15, 2023
1 parent cb923b9 commit 6e5d27c
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 23 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
## 0.9.4-dev0


### Enhancements


### Features


### Fixes
* fix pdf partition of list items being detected as titles in OCR only mode

## 0.9.3

### Enhancements
Expand All @@ -18,6 +30,7 @@

### Fixes

* fix pdf partition of list items being detected as titles in OCR only mode
* make notion module discoverable
* fix emails with `Content-Disposition: inline` and `Content-Disposition: attachment` with no filename
* Fix email attachment filenames which had `=` in the filename itself
Expand Down
Binary file added example-docs/list-item-example.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions test_unstructured/cleaners/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,30 @@ def test_group_broken_paragraphs_non_default_settings():
)


def test_group_broken_paragraphs_with_bullets():
    """Bullet items wrapped over several lines come back as one string per bullet."""
    wrapped = """○The big red fox
is walking down the lane.
○At the end of the lane
the fox met a friendly bear."""
    # The line break that precedes the second bullet is turned into a space,
    # which is why the first item ends with a trailing blank.
    expected = [
        "○The big red fox is walking down the lane. ",
        "○At the end of the lane the fox met a friendly bear.",
    ]
    grouped = core.group_bullet_paragraph(wrapped)
    assert grouped == expected


def test_group_bullet_paragraph_with_e_bullets():
    """A lone leading "e" (an OCR artifact for a bullet) is rewritten to "·" and grouped."""
    wrapped = """e The big red fox
is walking down the lane.
e At the end of the lane
the fox met a friendly bear."""
    # Each line-leading "e" is expected to be replaced by the "·" bullet before grouping.
    expected = [
        "· The big red fox is walking down the lane. ",
        "· At the end of the lane the fox met a friendly bear.",
    ]
    grouped = core.group_bullet_paragraph(wrapped)
    assert grouped == expected


@pytest.mark.parametrize(
# NOTE(yuming): Tests combined cleaners
(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -604,30 +604,12 @@
},
{
"type": "ListItem",
"element_id": "19d8999a73cdba6f00747d214f1ef31b",
"element_id": "d5bf23fd7f622ba14b5dd626721d0388",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "NOTE: A \"print position\" may contain several characters"
},
{
"type": "NarrativeText",
"element_id": "85231efbe0f659114b09387f8dfd695f",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "which are the result of overstrikes, or of sequences such as"
},
{
"type": "Title",
"element_id": "17548c22a23e66db7798c82919b49bc0",
"metadata": {
"data_source": {},
"filetype": "text/plain"
},
"text": "<char1> BS <char2>..."
"text": "NOTE: A \"print position\" may contain several characters which are the result of overstrikes, or of sequences such as <char1> BS <char2>..."
},
{
"type": "Title",
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.9.3" # pragma: no cover
__version__ = "0.9.4-dev0" # pragma: no cover
35 changes: 33 additions & 2 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
)
from unstructured.nlp.patterns import (
DOUBLE_PARAGRAPH_PATTERN_RE,
E_BULLET_PATTERN,
LINE_BREAK_RE,
PARAGRAPH_PATTERN,
PARAGRAPH_PATTERN_RE,
UNICODE_BULLETS_RE,
UNICODE_BULLETS_RE_0W,
)


Expand Down Expand Up @@ -66,6 +68,34 @@ def clean_ordered_bullets(text) -> str:
return text_cl


def group_bullet_paragraph(paragraph: str) -> list:
    """Split a bulleted paragraph into one string per bullet item.

    Line breaks that only exist for visual wrapping inside an item are collapsed,
    so each returned string holds a complete bullet item (bullet character included).

    For example:

    '''○ The big red fox
    is walking down the lane.

    ○ At the end of the lane
    the fox met a friendly bear.'''

    yields one entry per bullet:

    '''○ The big red fox is walking down the lane.'''
    '''○ At the end of the lane the fox met a friendly bear.'''

    Parameters
    ----------
    paragraph
        Text made up of one or more bullet items, possibly wrapped over several lines.

    Returns
    -------
    list
        The non-empty bullet items, in order, with every PARAGRAPH_PATTERN match
        replaced by a single space.
    """
    # pytesseract converts some bullet points to standalone "e" characters.
    # Swap those for a real bullet, since partition_text later relies on bullets
    # to determine the list element type.
    normalized = re.sub(E_BULLET_PATTERN, "·", paragraph).strip()

    # The split pattern is zero-width, so every piece keeps its leading bullet;
    # empty pieces (e.g. the one before the very first bullet) are dropped.
    return [
        re.sub(PARAGRAPH_PATTERN, " ", item)
        for item in re.split(UNICODE_BULLETS_RE_0W, normalized)
        if item
    ]


def group_broken_paragraphs(
text: str,
line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
Expand Down Expand Up @@ -97,8 +127,9 @@ def group_broken_paragraphs(
# http://www.apache.org/licenses/
para_split = line_split.split(paragraph)
all_lines_short = all(len(line.strip().split(" ")) < 5 for line in para_split)
if UNICODE_BULLETS_RE.match(paragraph.strip()):
clean_paragraphs.extend(re.split(PARAGRAPH_PATTERN, paragraph))
# pytesseract converts some bullet points to standalone "e" characters
if UNICODE_BULLETS_RE.match(paragraph.strip()) or E_BULLET_PATTERN.match(paragraph.strip()):
clean_paragraphs.extend(group_bullet_paragraph(paragraph))
elif all_lines_short:
clean_paragraphs.extend([line for line in para_split if line.strip()])
else:
Expand Down
3 changes: 3 additions & 0 deletions unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
]
# All entries of UNICODE_BULLETS joined with "|" so they can be embedded as one
# alternation inside larger regular expressions.
BULLETS_PATTERN = "|".join(UNICODE_BULLETS)
# Matches a bullet that is not immediately followed by another bullet.
UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")
# Zero-width match at a position where a bullet follows and no bullet precedes.
# Because nothing is consumed, bullet characters are not removed when using .split().
# NOTE(review): Python's `re` only accepts fixed-width lookbehind, so this presumably
# relies on every UNICODE_BULLETS entry matching the same width — confirm.
UNICODE_BULLETS_RE_0W = re.compile(f"(?={BULLETS_PATTERN})(?<!{BULLETS_PATTERN})")
# Matches a lone "e" at the start of any line (re.MULTILINE) that is followed by
# whitespace; the whitespace itself is not part of the match.
E_BULLET_PATTERN = re.compile(r"^e(?=\s)", re.MULTILINE)

# NOTE(klaijan) - Captures reference of format [1] or [i] or [a] at any point in the line.
REFERENCE_PATTERN = r"\[(?:[\d]+|[a-z]|[ivxlcdm])\]"
Expand Down

0 comments on commit 6e5d27c

Please sign in to comment.