Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix various cases of HTML text missing after partition #1587

Merged
merged 9 commits into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@

### Fixes

* **Fix various cases of HTML text missing after partition**
Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
Fix: Updated code to deal with these cases.
Importance: This will ensure the correctness when partitioning HTML and Markdown documents.


## 0.10.18

Expand Down
40 changes: 38 additions & 2 deletions test_unstructured/documents/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from unstructured.documents.html import (
HEADING_TAGS,
LIST_ITEM_TAGS,
SECTION_TAGS,
TABLE_TAGS,
TEXT_TAGS,
HTMLDocument,
Expand All @@ -41,8 +42,15 @@

TAGS = TAGS.replace(">", "").split("<")[1:]

INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"]
EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS]
# Tag names collected under VOID_TAGS (void elements in the HTML standard: no content,
# no closing tag). They are written as one "<a><b>..." string and split on the next line.
VOID_TAGS = "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
# Turn "<a><b>" into ["a", "b"]: drop every ">" and split on "<"; the [1:] discards the
# empty string that precedes the first "<".
VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:]

# Union of the text, heading, list-item and section tag lists imported from the html module.
INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
# Every tag in TAGS that is neither included above, nor a table tag, nor a void tag,
# nor one of the document-structure tags html/head/body.
EXCLUDED_TAGS = [
tag
for tag in TAGS
if tag not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"])
]


@pytest.fixture()
Expand Down Expand Up @@ -685,3 +693,31 @@ def test_sample_doc_with_emoji():
# NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
# and the byte string representation when running locally on mac
assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"]


def test_only_plain_text_in_body():
    """Text placed directly inside <body> becomes the first element."""
    document = HTMLDocument.from_string("<body>Hello</body>")
    first_element = document.elements[0]
    assert first_element.text == "Hello"


def test_plain_text_before_anything_in_body():
    """Leading text in <body> is kept and precedes the following <p> element."""
    document = HTMLDocument.from_string("<body>Hello<p>World</p></body>")
    elements = document.elements
    # The bare text comes first, then the paragraph that follows it.
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"


def test_line_break_in_container():
    """A <br/> inside a <div> splits the text into two consecutive elements."""
    document = HTMLDocument.from_string("<div>Hello<br/>World</div>")
    elements = document.elements
    # Text before the break must not be dropped; text after it follows as its own element.
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"


@pytest.mark.parametrize("tag", TEXT_TAGS)
def test_line_break_in_text_tag(tag):
    """A <br/> inside any text tag splits the text into two consecutive elements."""
    markup = f"<{tag}>Hello<br/>World</{tag}>"
    document = HTMLDocument.from_string(markup)
    elements = document.elements
    # Text on each side of the break is expected as a separate element, in order.
    assert elements[0].text == "Hello"
    assert elements[1].text == "World"
12 changes: 9 additions & 3 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _is_container_with_text(tag_elem: etree.Element) -> bool:
<div>Please read my message!</div>
</div>
"""
if tag_elem.tag not in SECTION_TAGS or len(tag_elem) == 0:
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
return False

if tag_elem.text is None or tag_elem.text.strip() == "":
Expand Down Expand Up @@ -451,6 +451,12 @@ def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[report

def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]:
unfurled = []

if tag_elem.text:
_tag_elem = etree.Element(tag_elem.tag)
_tag_elem.text = tag_elem.text
unfurled.append(_tag_elem)

children = tag_elem.getchildren()
for child in children:
if not _has_break_tags(child):
Expand All @@ -474,13 +480,13 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return False

if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
return True

# NOTE(robinson) - This indicates that a div tag has no children. If that's the
# case and the tag has text, its potential a text tag
children = tag_elem.getchildren()
if tag_elem.tag in SECTION_TAGS and len(children) == 0:
if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
return True

if _has_adjacent_bulleted_spans(tag_elem, children):
Expand Down
Loading