fix: Fix various cases of HTML text missing after partition (#1587)

Fix 4 cases of text missing after partition:

1. Text immediately after `<body>`
   ```html
   <body>
     missing1
     <div>hello</div>
   </body>
   ```
2. Text inside a container and immediately after `<br/>`
   ```html
   <div>hello<br/>missing2</div>
   ```
3. Text immediately after a text opening tag, if said tag contains `<br/>`
   ```html
   <p>missing3<br/>hello</p>
   ```
4. Text inside `<body>` if it is the only content (different cause from case 1)
   ```html
   <body>missing4</body>
   ```

Also fix the problem causing `test_unstructured/documents/test_html.py::test_exclude_tag_types` to not work as intended.

This will close GitHub issue #1543.
Unstructured-IO · Oct 3, 2023 · 89bd2fa · 89bd2fa
1 parent 11cdd8d
commit 89bd2fa
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,11 @@
 
 ### Fixes
 
+* **Fix various cases of HTML text missing after partition**
+  Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
+  Fix: Updated code to deal with these cases.
+  Importance: This ensures correctness when partitioning HTML and Markdown documents.
+
 
 ## 0.10.18
 

diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
@@ -17,6 +17,7 @@
 from unstructured.documents.html import (
     HEADING_TAGS,
     LIST_ITEM_TAGS,
+    SECTION_TAGS,
     TABLE_TAGS,
     TEXT_TAGS,
     HTMLDocument,
@@ -41,8 +42,15 @@
 
 TAGS = TAGS.replace(">", "").split("<")[1:]
 
-INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"]
-EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS]
+VOID_TAGS = "<area><base><br><col><embed><hr><img><input><link><meta><param><source><track><wbr>"
+VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:]
+
+INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS
+EXCLUDED_TAGS = [
+    tag
+    for tag in TAGS
+    if tag not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"])
+]
 
 
 @pytest.fixture()
@@ -685,3 +693,31 @@ def test_sample_doc_with_emoji():
     # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners
     # and the byte string representation when running locally on mac
     assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"]
+
+
+def test_only_plain_text_in_body():
+    raw_html = "<body>Hello</body>"
+    doc = HTMLDocument.from_string(raw_html)
+    assert doc.elements[0].text == "Hello"
+
+
+def test_plain_text_before_anything_in_body():
+    raw_html = "<body>Hello<p>World</p></body>"
+    doc = HTMLDocument.from_string(raw_html)
+    assert doc.elements[0].text == "Hello"
+    assert doc.elements[1].text == "World"
+
+
+def test_line_break_in_container():
+    raw_html = "<div>Hello<br/>World</div>"
+    doc = HTMLDocument.from_string(raw_html)
+    assert doc.elements[0].text == "Hello"
+    assert doc.elements[1].text == "World"
+
+
+@pytest.mark.parametrize("tag", TEXT_TAGS)
+def test_line_break_in_text_tag(tag):
+    raw_html = f"<{tag}>Hello<br/>World</{tag}>"
+    doc = HTMLDocument.from_string(raw_html)
+    assert doc.elements[0].text == "Hello"
+    assert doc.elements[1].text == "World"
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
@@ -417,7 +417,7 @@ def _is_container_with_text(tag_elem: etree.Element) -> bool:
         <div>Please read my message!</div>
     </div>
     """
-    if tag_elem.tag not in SECTION_TAGS or len(tag_elem) == 0:
+    if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
         return False
 
     if tag_elem.text is None or tag_elem.text.strip() == "":
@@ -451,6 +451,12 @@ def _has_break_tags(tag_elem: etree._Element) -> bool:  # pyright: ignore[report
 
 def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]:
     unfurled = []
+
+    if tag_elem.text:
+        _tag_elem = etree.Element(tag_elem.tag)
+        _tag_elem.text = tag_elem.text
+        unfurled.append(_tag_elem)
+
     children = tag_elem.getchildren()
     for child in children:
         if not _has_break_tags(child):
@@ -474,13 +480,13 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
     if len(tag_elem) > max_predecessor_len + empty_elems_len:
         return False
 
-    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
+    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
         return True
 
     # NOTE(robinson) - This indicates that a div tag has no children. If that's the
     # case and the tag has text, it's potentially a text tag
     children = tag_elem.getchildren()
-    if tag_elem.tag in SECTION_TAGS and len(children) == 0:
+    if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
         return True
 
     if _has_adjacent_bulleted_spans(tag_elem, children):