diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index bc8bdcc6b0..3185eb61c1 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -287,6 +287,47 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
             max_characters=max_characters,
         )
 
+
+def test_chunk_by_title_drops_detection_class_prob():
+    """Elements differing only in detection_class_prob are still chunked together."""
+    elements = [
+        Title(
+            "A Great Day",
+            metadata=ElementMetadata(
+                detection_class_prob=0.5,
+            ),
+        ),
+        Text(
+            "Today is a great day.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.62,
+            ),
+        ),
+        Text(
+            "It is sunny outside.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.73,
+            ),
+        ),
+        Title(
+            "An Okay Day",
+            metadata=ElementMetadata(
+                detection_class_prob=0.84,
+            ),
+        ),
+        Text(
+            "Today is an okay day.",
+            metadata=ElementMetadata(
+                detection_class_prob=0.95,
+            ),
+        ),
+    ]
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
+    assert str(chunks[0]) == str(
+        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
+    )
+    assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
+
 
 def test_chunk_by_title_drops_extra_metadata():
     elements = [
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index 0c5bde799c..f24dceab61 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -197,7 +197,14 @@ def _drop_extra_metadata(
     metadata_dict: Dict[str, Any],
     include_pages: bool = True,
 ) -> Dict[str, Any]:
-    keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"]
+    keys_to_drop = [
+        "element_id",
+        "type",
+        "coordinates",
+        "parent_id",
+        "category_depth",
+        "detection_class_prob",
+    ]
     if not include_pages and "page_number" in metadata_dict:
         keys_to_drop.append("page_number")
 