Skip to content

Commit

Permalink
feat: add min_partition kwarg to that combines elements below a speci…
Browse files Browse the repository at this point in the history
…fied threshold (#926)

* add min_partition

* functioning _split_content_to_fit_min_max

* create test and make tidy/check

* fix rebase issues

* fix type hinting, remove unused code, add tests

* various changes and refactoring of methods

* add test, refactor, change var names for debugging purposes

* update test

* make tidy/check

* give more descriptive var names and add comments

* update xml partition via partition_text and create test

* fix <pre> bug for test_partition_html_with_pre_tag

* make tidy

* refactor and fix tests

* make tidy/check

* ingest-test-fixtures-update

* change list comprehension to for loop

* fix error check
  • Loading branch information
Coniferish authored Jul 24, 2023
1 parent d032912 commit 676c50a
Show file tree
Hide file tree
Showing 11 changed files with 314 additions and 45 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## 0.8.2-dev4

### Enhancements

* Add `min_partition` kwarg that combines elements below a specified threshold, and modify splitting of strings longer than `max_partition` so that words are not split.
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx

Expand Down
14 changes: 14 additions & 0 deletions test_unstructured/documents/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ def test_from_string(sample_document):
assert type_tag.text.strip() == "10-K"


def test_from_string_with_pre_tag():
    """A document wrapped in <pre> still exposes its <TYPE> text after parsing."""
    pre_wrapped_markup = """
<pre>
<SEC-DOCUMENT>
<TYPE>10-K
<COMPANY>Proctor & Gamble
</SEC-DOCUMENT>
</pre>
"""
    parsed_tree = XMLDocument.from_string(pre_wrapped_markup).document_tree
    # Tag lookup uses the lowercase name, as in the sibling from_string test.
    assert parsed_tree.find(".//type").text.strip() == "10-K"


def test_read_with_stylesheet():
filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")
Expand Down
13 changes: 13 additions & 0 deletions test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
assert element.metadata.filename is None


def test_partition_email_from_text_file_max():
    """Partition the plain-text fixture email with ``max_partition=20``; expect six elements."""
    email_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    with open(email_path) as email_file:
        partitioned = partition_email(
            file=email_file,
            content_source="text/plain",
            max_partition=20,
        )
    assert len(partitioned) == 6


def test_partition_email_from_text_file_raises_value_error():
    """Passing ``min_partition=1000`` for the plain-text fixture email raises ``ValueError``."""
    email_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
    # Enter pytest.raises first, then open the file, matching the combined ``with``.
    with pytest.raises(ValueError):
        with open(email_path) as email_file:
            partition_email(
                file=email_file,
                content_source="text/plain",
                min_partition=1000,
            )


def test_partition_email_from_text():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
with open(filename) as f:
Expand Down
1 change: 0 additions & 1 deletion test_unstructured/partition/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():

def test_partition_image_with_ocr_detects_korean_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")

with open(filename, "rb") as f:
elements = image.partition_image(
file=f,
Expand Down
104 changes: 103 additions & 1 deletion test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@

from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text
from unstructured.partition.text import (
combine_paragraphs_less_than_min,
partition_text,
split_content_to_fit_max,
)

DIRECTORY = pathlib.Path(__file__).parent.resolve()

Expand All @@ -18,6 +22,31 @@
ListItem(text="I love fuzzy blankets"),
]

# Single-line sample text used by test_split_content_to_fit_max.
# Built from adjacent string literals with explicit trailing spaces so that word
# boundaries never depend on whitespace hidden at the start of a continuation
# line inside a multi-line literal (stripping "\n" alone would fuse words).
MIN_MAX_TEXT = (
    "This is a story. This is a story that doesn't matter "
    "because it is just being used as an example. Hi. Hello. Howdy. Hola. "
    "The example is simple and repetitive and long and somewhat boring, "
    "but it serves a purpose. End."
)

# Sample text with one sentence per line. It is the input for
# test_partition_text_min_max and test_combine_paragraphs_less_than_min.
# Several lines ("Hi.", "Hello.", "Howdy.", "Hola.", "End.") are deliberately
# shorter than the min_partition values those tests pass.
SHORT_PARAGRAPHS = """This is a story.
This is a story that doesn't matter because it is just being used as an example.
Hi.
Hello.
Howdy.
Hola.
The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
End.
"""


@pytest.mark.parametrize(
("filename", "encoding"),
Expand Down Expand Up @@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
assert elements[-1].text.endswith("External links")


def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
    """Capping partitions at 500 characters yields more elements than the default call."""
    default_count = len(partition_text(filename=filename))
    capped_count = len(partition_text(filename=filename, max_partition=500))
    assert default_count < capped_count


def test_partition_text_min_max():
    """Pin the exact segments produced from SHORT_PARAGRAPHS.

    Two configurations are checked: ``min_partition`` on its own, and
    ``min_partition`` combined with a small ``max_partition``.

    Each check compares the full list of segment texts with the expected list.
    Pairing the two with ``zip()`` would stop at the shorter one, so missing or
    extra segments would go unnoticed.
    """
    segments = partition_text(
        text=SHORT_PARAGRAPHS,
        min_partition=6,
    )
    expected = [
        "This is a story.",
        "This is a story that doesn't matter because it is just being used as an example.",
        "Hi. Hello.",
        "Howdy.",
        # Adjacent literals with an explicit space, so the expected text does not
        # depend on whitespace at the start of a continuation line.
        (
            "Hola. The example is simple and repetitive and long and somewhat boring, "
            "but it serves a purpose. End."
        ),
    ]
    assert [segment.text for segment in segments] == expected

    segments = partition_text(
        text=SHORT_PARAGRAPHS,
        max_partition=20,
        min_partition=7,
    )
    expected = [
        "This is a story.",
        "This is a story that",
        "doesn't matter",
        "because it is just",
        "being used as an",
        "example.",
        "Hi. Hello.",
        "Howdy. Hola.",
        "The example is",
        "simple and",
        "repetitive and long",
        "and somewhat boring,",
        "but it serves a",
        "purpose. End.",
    ]
    assert [segment.text for segment in segments] == expected


def test_split_content_to_fit_max():
    """Splitting MIN_MAX_TEXT with ``max_partition=75`` returns the five expected chunks."""
    expected_chunks = [
        "This is a story.",
        "This is a story that doesn't matter because",
        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
        "The example is simple and repetitive and long",
        "and somewhat boring, but it serves a purpose. End.",
    ]
    chunks = split_content_to_fit_max(content=MIN_MAX_TEXT, max_partition=75)
    assert chunks == expected_chunks


def test_combine_paragraphs_less_than_min():
    """Combining with ``min_partition=7`` returns fewer segments than input paragraphs.

    SHORT_PARAGRAPHS contains several sentences shorter than seven characters
    (for example "Hi."), so at least one merge is expected.
    """
    # One paragraph per non-blank line, however many newlines separate them.
    paragraphs = [line for line in SHORT_PARAGRAPHS.splitlines() if line.strip()]
    segments = combine_paragraphs_less_than_min(
        paragraphs,
        max_partition=1500,
        min_partition=7,
    )
    # Compare like with like: len(SHORT_PARAGRAPHS) is a character count, which
    # always exceeds the number of segments and would make this check vacuous.
    assert len(segments) < len(paragraphs)


def test_partition_text_doesnt_get_page_breaks():
text = "--------------------"
elements = partition_text(text=text)
Expand Down
9 changes: 6 additions & 3 deletions unstructured/documents/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from unstructured.logger import logger
from unstructured.partition.text import (
element_from_text,
split_by_paragraph,
partition_text,
)

VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
Expand Down Expand Up @@ -78,13 +78,16 @@ def _read_xml(self, content):
# Please use bytes input or XML fragments without declaration.
except ValueError:
document_tree = etree.fromstring(content.encode(), self.parser)

if "<pre>" and "</pre>" in content:
tree = etree.HTML(content)
for element in tree.xpath("//pre"):
if not element.text:
continue
text_content = split_by_paragraph(element.text)

text_content = []
for element in partition_text(text=element.text, paragraph_grouper=False):
text_content.append(element.text)

for text in text_content:
element = etree.Element("span")
element.text = str(element_from_text(text=text))
Expand Down
9 changes: 6 additions & 3 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text, split_by_paragraph
from unstructured.partition.text import partition_text

VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]

Expand Down Expand Up @@ -232,6 +232,7 @@ def partition_email(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions an .eml document into its constituent elements.
Expand All @@ -258,6 +259,9 @@ def partition_email(
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing the text/plain content.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
Expand All @@ -270,7 +274,6 @@ def partition_email(

# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)

detected_encoding = "utf-8"
if filename is not None:
extracted_encoding, msg = parse_email(filename=filename)
Expand Down Expand Up @@ -342,12 +345,12 @@ def partition_email(
continue

elif content_source == "text/plain":
list_content = split_by_paragraph(content)
elements = partition_text(
text=content,
encoding=encoding,
max_partition=max_partition,
metadata_filename=metadata_filename or filename,
min_partition=min_partition,
)

for idx, element in enumerate(elements):
Expand Down
10 changes: 9 additions & 1 deletion unstructured/partition/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def partition_msg(
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file
Expand All @@ -42,6 +43,9 @@ def partition_msg(
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
min_partition
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -57,7 +61,11 @@ def partition_msg(
if "<html>" in text or "</div>" in text:
elements = partition_html(text=text)
else:
elements = partition_text(text=text, max_partition=max_partition)
elements = partition_text(
text=text,
max_partition=max_partition,
min_partition=min_partition,
)

metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
for element in elements:
Expand Down
20 changes: 18 additions & 2 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def partition_pdf(
max_partition: Optional[int] = 1500,
include_metadata: bool = True,
metadata_filename: Optional[str] = None,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -81,6 +82,9 @@ def partition_pdf(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies to the "ocr_only" strategy.
min_partition
The minimum number of characters to include in a partition. Only applies to the
"ocr_only" strategy.
"""
exactly_one(filename=filename, file=file)
return partition_pdf_or_image(
Expand All @@ -91,6 +95,7 @@ def partition_pdf(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
max_partition=max_partition,
min_partition=min_partition,
**kwargs,
)

Expand All @@ -116,6 +121,7 @@ def partition_pdf_or_image(
infer_table_structure: bool = False,
ocr_languages: str = "eng",
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -172,6 +178,7 @@ def partition_pdf_or_image(
ocr_languages=ocr_languages,
is_image=is_image,
max_partition=max_partition,
min_partition=min_partition,
)

return layout_elements
Expand Down Expand Up @@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
ocr_languages: str = "eng",
is_image: bool = False,
max_partition: Optional[int] = 1500,
min_partition: Optional[int] = 0,
):
"""Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
Expand All @@ -402,7 +410,11 @@ def _partition_pdf_or_image_with_ocr(
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
else:
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
elements = partition_text(text=text, max_partition=max_partition)
elements = partition_text(
text=text,
max_partition=max_partition,
min_partition=min_partition,
)
else:
elements = []
page_number = 0
Expand All @@ -411,7 +423,11 @@ def _partition_pdf_or_image_with_ocr(
metadata = ElementMetadata(filename=filename, page_number=page_number)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")

_elements = partition_text(text=text, max_partition=max_partition)
_elements = partition_text(
text=text,
max_partition=max_partition,
min_partition=min_partition,
)
for element in _elements:
element.metadata = metadata
elements.append(element)
Expand Down
Loading

0 comments on commit 676c50a

Please sign in to comment.