Skip to content

Commit

Permalink
ignore detection class prob when chunking
Browse files Browse the repository at this point in the history
  • Loading branch information
ryannikolaidis committed Oct 4, 2023
1 parent 13453d6 commit bab318e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
40 changes: 40 additions & 0 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,46 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
max_characters=max_characters,
)

def test_chunk_by_title_drops_detection_class_prob():
    """Check title chunking of elements that each carry a different
    ``detection_class_prob`` in their metadata.

    Five elements (two titled sections) are chunked with
    ``combine_text_under_n_chars=0``. The string form of the first two chunks
    must equal that of a ``CompositeElement`` holding each section's texts
    joined by blank lines.
    """
    # NOTE(review): presumably the per-element probabilities would otherwise
    # be treated as a metadata change and split the sections -- confirm
    # against chunk_by_title.
    # Each row: (element class, text, detection_class_prob).
    element_specs = [
        (Title, "A Great Day", 0.5),
        (Text, "Today is a great day.", 0.62),
        (Text, "It is sunny outside.", 0.73),
        (Title, "An Okay Day", 0.84),
        (Text, "Today is an okay day.", 0.95),
    ]
    elements = [
        element_cls(
            text,
            metadata=ElementMetadata(detection_class_prob=prob),
        )
        for element_cls, text, prob in element_specs
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    expected_texts = [
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        "An Okay Day\n\nToday is an okay day.",
    ]
    # Index explicitly (rather than zip) so a missing chunk still fails loudly.
    for position, expected_text in enumerate(expected_texts):
        assert str(chunks[position]) == str(CompositeElement(expected_text))



def test_chunk_by_title_drops_extra_metadata():
elements = [
Expand Down
2 changes: 1 addition & 1 deletion unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def _drop_extra_metadata(
metadata_dict: Dict[str, Any],
include_pages: bool = True,
) -> Dict[str, Any]:
keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"]
keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth", "detection_class_prob"]
if not include_pages and "page_number" in metadata_dict:
keys_to_drop.append("page_number")

Expand Down

0 comments on commit bab318e

Please sign in to comment.