fix: chunking fails with detection_class_prob in metadata (#1637)

Unstructured-IO · Oct 4, 2023 · 9960ce5 · 9960ce5
1 parent 0a65fc2
commit 9960ce5
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.19-dev10
+## 0.10.19-dev11
 
 ### Enhancements
 
@@ -20,6 +20,7 @@
  Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
   Fix: Updated code to deal with these cases.
   Importance: This will ensure the correctness when partitioning HTML and Markdown documents.
+* **Fixes chunking when `detection_class_prob` appears in Element metadata** Problem: when `detection_class_prob` appears in Element metadata, Elements will only be combined by `chunk_by_title` if they have the same `detection_class_prob` value (which is rare). This is unlikely to be a case we ever need to support, and it most often results in no chunking. Fix: `detection_class_prob` is added to the list of metadata keys that chunking excludes from the similarity comparison. Importance: This change allows `chunk_by_title` to operate as intended for documents which include `detection_class_prob` metadata in their Elements.
 
 ## 0.10.18
 

diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
@@ -288,6 +288,46 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
         )
 
 
+def test_chunk_by_title_drops_detection_class_prob():
+    elements = [
+        Title(
+            "A Great Day",
+            metadata=ElementMetadata(
+                detection_class_prob=0.5,
+            ),
+        ),
+        Text(
+            "Today is a great day.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.62,
+            ),
+        ),
+        Text(
+            "It is sunny outside.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.73,
+            ),
+        ),
+        Title(
+            "An Okay Day",
+            metadata=ElementMetadata(
+                detection_class_prob=0.84,
+            ),
+        ),
+        Text(
+            "Today is an okay day.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.95,
+            ),
+        ),
+    ]
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+    assert str(chunks[0]) == str(
+        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
+    )
+    assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
+
+
 def test_chunk_by_title_drops_extra_metadata():
     elements = [
         Title(

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.19-dev10"  # pragma: no cover
+__version__ = "0.10.19-dev11"  # pragma: no cover
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
@@ -197,7 +197,14 @@ def _drop_extra_metadata(
     metadata_dict: Dict[str, Any],
     include_pages: bool = True,
 ) -> Dict[str, Any]:
-    keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"]
+    keys_to_drop = [
+        "element_id",
+        "type",
+        "coordinates",
+        "parent_id",
+        "category_depth",
+        "detection_class_prob",
+    ]
     if not include_pages and "page_number" in metadata_dict:
         keys_to_drop.append("page_number")