feat: add min_partition kwarg that combines elements below a speci…

…fied threshold (#926) * add min_partition * functioning _split_content_to_fit_min_max * create test and make tidy/check * fix rebase issues * fix type hinting, remove unused code, add tests * various changes and refactoring of methods * add test, refactor, change var names for debugging purposes * update test * make tidy/check * give more descriptive var names and add comments * update xml partition via partition_text and create test * fix <pre> bug for test_partition_html_with_pre_tag * make tidy * refactor and fix tests * make tidy/check * ingest-test-fixtures-update * change list comprehension to for loop * fix error check
Unstructured-IO · Jul 24, 2023 · 676c50a · 676c50a
1 parent d032912
commit 676c50a
Show file tree

Hide file tree

Showing 11 changed files with 314 additions and 45 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ## 0.8.2-dev4
 
 ### Enhancements
-
+* Add `min_partition` kwarg that combines elements below a specified threshold, and modify splitting of strings longer than `max_partition` so words are not split.
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 

diff --git a/test_unstructured/documents/test_xml.py b/test_unstructured/documents/test_xml.py
@@ -44,6 +44,20 @@ def test_from_string(sample_document):
     assert type_tag.text.strip() == "10-K"
 
 
+def test_from_string_with_pre_tag():
+    """Parsing markup wrapped in <pre> still yields a findable <type> tag."""
+    pre_wrapped_markup = """
+    <pre>
+    <SEC-DOCUMENT>
+    <TYPE>10-K
+    <COMPANY>Proctor & Gamble
+    </SEC-DOCUMENT>
+    </pre>
+    """
+    document = XMLDocument.from_string(pre_wrapped_markup)
+    filing_type = document.document_tree.find(".//type").text
+    assert filing_type.strip() == "10-K"
+
+
 def test_read_with_stylesheet():
     filename = os.path.join(FILEPATH, "..", "..", "example-docs", "factbook.xml")
     stylesheet = os.path.join(FILEPATH, "..", "..", "example-docs", "unsupported", "factbook.xsl")

diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -241,6 +241,19 @@ def test_partition_email_from_text_file_with_headers():
         assert element.metadata.filename is None
 
 
+def test_partition_email_from_text_file_max():
+    """Partitioning the plain-text email with max_partition=20 yields six elements."""
+    email_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    with open(email_path) as email_file:
+        partitioned = partition_email(
+            file=email_file,
+            content_source="text/plain",
+            max_partition=20,
+        )
+    assert len(partitioned) == 6
+
+
+def test_partition_email_from_text_file_raises_value_error():
+    """partition_email raises ValueError when min_partition=1000 is used on this email."""
+    email_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
+    # pytest.raises is entered first, so it wraps both the open() and the partition call.
+    with pytest.raises(ValueError), open(email_path) as email_file:
+        partition_email(
+            file=email_file,
+            content_source="text/plain",
+            min_partition=1000,
+        )
+
+
 def test_partition_email_from_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
     with open(filename) as f:

diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
@@ -144,7 +144,6 @@ def test_partition_image_with_ocr_detects_korean():
 
 def test_partition_image_with_ocr_detects_korean_from_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
-
     with open(filename, "rb") as f:
         elements = image.partition_image(
             file=f,

diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -5,7 +5,11 @@
 
 from unstructured.cleaners.core import group_broken_paragraphs
 from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import (
+    combine_paragraphs_less_than_min,
+    partition_text,
+    split_content_to_fit_max,
+)
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 
@@ -18,6 +22,31 @@
     ListItem(text="I love fuzzy blankets"),
 ]
 
+# Single-paragraph sample text; test_split_content_to_fit_max splits it by max_partition.
+MIN_MAX_TEXT = (
+    "This is a story. This is a story that doesn't matter"
+    " because it is just being used as an example. Hi. Hello. Howdy. Hola."
+    " The example is simple and repetitive and long and somewhat boring,"
+    " but it serves a purpose. End."
+)
+
+# The same sentences as blank-line-separated paragraphs, ending with one trailing newline.
+SHORT_PARAGRAPHS = (
+    "\n\n".join(
+        [
+            "This is a story.",
+            "This is a story that doesn't matter because it is just being used as an example.",
+            "Hi.",
+            "Hello.",
+            "Howdy.",
+            "Hola.",
+            "The example is simple and repetitive and long and somewhat boring, "
+            "but it serves a purpose.",
+            "End.",
+        ],
+    )
+    + "\n"
+)
+
 
 @pytest.mark.parametrize(
     ("filename", "encoding"),
@@ -201,6 +230,79 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
     assert elements[-1].text.endswith("External links")
 
 
+def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
+    """A max_partition of 500 produces more elements than the default partitioning."""
+    default_count = len(partition_text(filename=filename))
+    capped_count = len(partition_text(filename=filename, max_partition=500))
+    assert default_count < capped_count
+
+
+def test_partition_text_min_max():
+    """Check partition_text output for min_partition alone and together with max_partition.
+
+    The complete list of element texts is compared with the expected list. Iterating over
+    zip(segments, expected) would stop at the shorter sequence, so a missing or extra
+    element would go unnoticed.
+    """
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        min_partition=6,
+    )
+    # In the expected output, paragraphs shorter than min_partition ("Hi.", "Hola.", "End.")
+    # appear joined to neighbouring text.
+    expected = [
+        "This is a story.",
+        "This is a story that doesn't matter because it is just being used as an example.",
+        "Hi. Hello.",
+        "Howdy.",
+        "Hola. The example is simple and repetitive and long and somewhat boring, "
+        "but it serves a purpose. End.",
+    ]
+    assert [segment.text for segment in segments] == expected
+
+    segments = partition_text(
+        text=SHORT_PARAGRAPHS,
+        max_partition=20,
+        min_partition=7,
+    )
+    # With both limits set, every expected chunk is at most 20 characters long and no
+    # word is cut in the middle.
+    expected = [
+        "This is a story.",
+        "This is a story that",
+        "doesn't matter",
+        "because it is just",
+        "being used as an",
+        "example.",
+        "Hi. Hello.",
+        "Howdy. Hola.",
+        "The example is",
+        "simple and",
+        "repetitive and long",
+        "and somewhat boring,",
+        "but it serves a",
+        "purpose. End.",
+    ]
+    assert [segment.text for segment in segments] == expected
+
+
+def test_split_content_to_fit_max():
+    """Splitting MIN_MAX_TEXT with max_partition=75 yields the expected chunks."""
+    expected_chunks = [
+        "This is a story.",
+        "This is a story that doesn't matter because",
+        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
+        "The example is simple and repetitive and long",
+        "and somewhat boring, but it serves a purpose. End.",
+    ]
+    chunks = split_content_to_fit_max(content=MIN_MAX_TEXT, max_partition=75)
+    assert chunks == expected_chunks
+
+
+def test_combine_paragraphs_less_than_min():
+    """Combining short paragraphs must yield fewer segments than there are input paragraphs.
+
+    The segment count is compared with the number of input paragraphs. Comparing it with
+    len(SHORT_PARAGRAPHS), which is a character count, would pass even if nothing had been
+    combined.
+    """
+    paragraphs = SHORT_PARAGRAPHS.split("\n\n")
+    segments = combine_paragraphs_less_than_min(
+        paragraphs,
+        max_partition=1500,
+        min_partition=7,
+    )
+    assert len(segments) < len(paragraphs)
+
+
 def test_partition_text_doesnt_get_page_breaks():
     text = "--------------------"
     elements = partition_text(text=text)

diff --git a/unstructured/documents/xml.py b/unstructured/documents/xml.py
@@ -7,7 +7,7 @@
 from unstructured.logger import logger
 from unstructured.partition.text import (
     element_from_text,
-    split_by_paragraph,
+    partition_text,
 )
 
 VALID_PARSERS = Union[etree.HTMLParser, etree.XMLParser, None]
@@ -78,13 +78,16 @@ def _read_xml(self, content):
             #     Please use  bytes input or XML fragments without declaration.
             except ValueError:
                 document_tree = etree.fromstring(content.encode(), self.parser)
-
             if "<pre>" and "</pre>" in content:
                 tree = etree.HTML(content)
                 for element in tree.xpath("//pre"):
                     if not element.text:
                         continue
-                    text_content = split_by_paragraph(element.text)
+
+                    text_content = []
+                    for element in partition_text(text=element.text, paragraph_grouper=False):
+                        text_content.append(element.text)
+
                     for text in text_content:
                         element = etree.Element("span")
                         element.text = str(element_from_text(text=text))

diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -52,7 +52,7 @@
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.html import partition_html
-from unstructured.partition.text import partition_text, split_by_paragraph
+from unstructured.partition.text import partition_text
 
 VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
 
@@ -232,6 +232,7 @@ def partition_email(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
@@ -258,6 +259,9 @@ def partition_email(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing the text/plain content.
     """
     if content_source not in VALID_CONTENT_SOURCES:
         raise ValueError(
@@ -270,7 +274,6 @@ def partition_email(
 
     # Verify that only one of the arguments was provided
     exactly_one(filename=filename, file=file, text=text)
-
     detected_encoding = "utf-8"
     if filename is not None:
         extracted_encoding, msg = parse_email(filename=filename)
@@ -342,12 +345,12 @@ def partition_email(
                             continue
 
     elif content_source == "text/plain":
-        list_content = split_by_paragraph(content)
         elements = partition_text(
             text=content,
             encoding=encoding,
             max_partition=max_partition,
             metadata_filename=metadata_filename or filename,
+            min_partition=min_partition,
         )
 
     for idx, element in enumerate(elements):

diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
@@ -22,6 +22,7 @@ def partition_msg(
     metadata_filename: Optional[str] = None,
     process_attachments: bool = False,
     attachment_partitioner: Optional[Callable] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
@@ -42,6 +43,9 @@ def partition_msg(
         processing the content of the email itself.
     attachment_partitioner
         The partitioning function to use to process attachments.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies if
+        processing text/plain content.
     """
     exactly_one(filename=filename, file=file)
 
@@ -57,7 +61,11 @@ def partition_msg(
     if "<html>" in text or "</div>" in text:
         elements = partition_html(text=text)
     else:
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
 
     metadata = build_msg_metadata(msg_obj, metadata_filename or filename)
     for element in elements:

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -51,6 +51,7 @@ def partition_pdf(
     max_partition: Optional[int] = 1500,
     include_metadata: bool = True,
     metadata_filename: Optional[str] = None,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
@@ -81,6 +82,9 @@ def partition_pdf(
     max_partition
         The maximum number of characters to include in a partition. If None is passed,
         no maximum is applied. Only applies to the "ocr_only" strategy.
+    min_partition
+        The minimum number of characters to include in a partition. Only applies to the
+        "ocr_only" strategy.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -91,6 +95,7 @@ def partition_pdf(
         infer_table_structure=infer_table_structure,
         ocr_languages=ocr_languages,
         max_partition=max_partition,
+        min_partition=min_partition,
         **kwargs,
     )
 
@@ -116,6 +121,7 @@ def partition_pdf_or_image(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
     **kwargs,
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
@@ -172,6 +178,7 @@ def partition_pdf_or_image(
                 ocr_languages=ocr_languages,
                 is_image=is_image,
                 max_partition=max_partition,
+                min_partition=min_partition,
             )
 
     return layout_elements
@@ -391,6 +398,7 @@ def _partition_pdf_or_image_with_ocr(
     ocr_languages: str = "eng",
     is_image: bool = False,
     max_partition: Optional[int] = 1500,
+    min_partition: Optional[int] = 0,
 ):
     """Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
     to an image prior to processing."""
@@ -402,7 +410,11 @@ def _partition_pdf_or_image_with_ocr(
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
         else:
             text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
-        elements = partition_text(text=text, max_partition=max_partition)
+        elements = partition_text(
+            text=text,
+            max_partition=max_partition,
+            min_partition=min_partition,
+        )
     else:
         elements = []
         page_number = 0
@@ -411,7 +423,11 @@ def _partition_pdf_or_image_with_ocr(
             metadata = ElementMetadata(filename=filename, page_number=page_number)
             text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
 
-            _elements = partition_text(text=text, max_partition=max_partition)
+            _elements = partition_text(
+                text=text,
+                max_partition=max_partition,
+                min_partition=min_partition,
+            )
             for element in _elements:
                 element.metadata = metadata
                 elements.append(element)