Skip to content

Commit

Permalink
fix: Fix various cases of HTML text missing after partition (#1587)
Browse files Browse the repository at this point in the history
Fix 4 cases of text missing after partition:
1. Text immediately after `<body>`
```html
<body>
  missing1
  <div>hello</div>
</body>
```

2. Text inside container and immediately after `<br/>`
```html
<div>hello<br/>missing2</div>
```

3. Text immediately after a text opening tag, if said tag contains
`<br/>`
```html
<p>missing3<br/>hello</p>
```

4. Text inside `<body>` if it is the only content (different cause from
case 1)
```html
<body>missing4</body>
```

Also fix a problem that caused
`test_unstructured/documents/test_html.py::test_exclude_tag_types` to
not work as intended.

This will close GitHub issue #1543
  • Loading branch information
unifyh authored Oct 3, 2023
1 parent 11cdd8d commit 89bd2fa
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@

### Fixes

* **Fix various cases of HTML text missing after partition**
Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
Fix: Updated code to deal with these cases.
Importance: This will ensure the correctness when partitioning HTML and Markdown documents.


## 0.10.18

Expand Down
40 changes: 38 additions & 2 deletions test_unstructured/documents/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from unstructured.documents.html import (
HEADING_TAGS,
LIST_ITEM_TAGS,
SECTION_TAGS,
TABLE_TAGS,
TEXT_TAGS,
HTMLDocument,
Expand All @@ -41,8 +42,15 @@

TAGS = TAGS.replace(">", "").split("<")[1:]

INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"]
EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS]
VOID_TAGS = "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:]

INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
EXCLUDED_TAGS = [
tag
for tag in TAGS
if tag not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"])
]


@pytest.fixture()
Expand Down Expand Up @@ -685,3 +693,31 @@ def test_sample_doc_with_emoji():
# NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
# and the byte string representation when running locally on mac
assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"]


def test_only_plain_text_in_body():
    """Bare text that is the only content of ``<body>`` becomes the first element."""
    document = HTMLDocument.from_string("<body>Hello</body>")
    first_element = document.elements[0]
    assert first_element.text == "Hello"


def test_plain_text_before_anything_in_body():
    """Text placed directly after ``<body>`` is kept ahead of the following ``<p>``."""
    # The bare "Hello" precedes the paragraph, so it must come out first.
    markup = "<body>" + "Hello" + "<p>World</p>" + "</body>"
    document = HTMLDocument.from_string(markup)
    assert document.elements[0].text == "Hello"
    assert document.elements[1].text == "World"


def test_line_break_in_container():
    """Text on both sides of a ``<br/>`` inside a ``<div>`` yields two elements."""
    # "Hello" sits before the break and "World" right after it.
    markup = "<div>" + "Hello" + "<br/>" + "World" + "</div>"
    document = HTMLDocument.from_string(markup)
    assert document.elements[0].text == "Hello"
    assert document.elements[1].text == "World"


@pytest.mark.parametrize("tag", TEXT_TAGS)
def test_line_break_in_text_tag(tag):
    """For every text tag, text before and after a ``<br/>`` is split into two elements."""
    opening, closing = f"<{tag}>", f"</{tag}>"
    markup = f"{opening}Hello<br/>World{closing}"
    document = HTMLDocument.from_string(markup)
    assert document.elements[0].text == "Hello"
    assert document.elements[1].text == "World"
12 changes: 9 additions & 3 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _is_container_with_text(tag_elem: etree.Element) -> bool:
<div>Please read my message!</div>
</div>
"""
if tag_elem.tag not in SECTION_TAGS or len(tag_elem) == 0:
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
return False

if tag_elem.text is None or tag_elem.text.strip() == "":
Expand Down Expand Up @@ -451,6 +451,12 @@ def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[report

def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]:
unfurled = []

if tag_elem.text:
_tag_elem = etree.Element(tag_elem.tag)
_tag_elem.text = tag_elem.text
unfurled.append(_tag_elem)

children = tag_elem.getchildren()
for child in children:
if not _has_break_tags(child):
Expand All @@ -474,13 +480,13 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return False

if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
return True

# NOTE(robinson) - This indicates that a div tag has no children. If that's the
# case and the tag has text, it's potentially a text tag
children = tag_elem.getchildren()
if tag_elem.tag in SECTION_TAGS and len(children) == 0:
if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
return True

if _has_adjacent_bulleted_spans(tag_elem, children):
Expand Down

0 comments on commit 89bd2fa

Please sign in to comment.