Skip to content

Commit

Permalink
enhancement: memory efficient xml partitioning (#1547)
Browse files Browse the repository at this point in the history
Closes #1236. Partitions XML documents iteratively in most cases*, never
loading the entire tree into memory. This ends up being much faster.

(* The exception is when the argument `xml_path` is passed to filter
elements. I was not able to find a way in Python to compare XPaths while
streaming the elements, aside from writing a custom XPath parser. So the
shortest way forward was to bite the bullet and load the whole tree in
memory when filtering by XPath.)

Memory usage is about 20% of usage on `main` when processing a 470MB XML
file. Time to process is 10s vs 900s.

Output is slightly different, but appears to be an improvement, adding
lines of text that were skipped by the previous partitioning. No text is lost.
  • Loading branch information
qued authored Sep 28, 2023
1 parent 62b0557 commit e5d0866
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 32 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.10.17-dev15
## 0.10.17-dev16

### Enhancements

* **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases.
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
* **Azure Cognitive Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index.
Expand Down
6 changes: 3 additions & 3 deletions test_unstructured/partition/test_xml_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_partition_xml_from_filename_with_metadata_filename():
)
def test_partition_xml_from_file(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)

assert elements[0].text == "United States"
Expand All @@ -47,7 +47,7 @@ def test_partition_xml_from_file(filename):

def test_partition_xml_from_file_with_metadata_filename():
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml")
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test")

assert elements[0].text == "United States"
Expand Down Expand Up @@ -158,7 +158,7 @@ def test_partition_xml_from_filename_exclude_metadata(filename):
)
def test_partition_xml_from_file_exclude_metadata(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(
file=f,
xml_keep_tags=False,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.17-dev15" # pragma: no cover
__version__ = "0.10.17-dev16" # pragma: no cover
72 changes: 45 additions & 27 deletions unstructured/partition/xml.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import xml.etree.ElementTree as ET
from io import BytesIO
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
from typing import IO, BinaryIO, Iterator, List, Optional, Union, cast

from lxml import etree

from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import (
Expand All @@ -20,41 +22,57 @@
from unstructured.partition.text import element_from_text


def is_leaf(elem):
return not bool(elem)


def is_string(elem):
return isinstance(elem, str) or (hasattr(elem, "text") and isinstance(elem.text, str))


def get_leaf_elements(
filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_path: str = ".",
xml_keep_tags: bool = False,
) -> List[Optional[str]]:
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Get leaf elements from the XML tree defined in filename, file, or text."""
exactly_one(filename=filename, file=file, text=text)
if filename:
_, raw_text = read_txt_file(filename=filename)
return _get_leaf_elements(filename, xml_path=xml_path)
elif file:
f = spooled_to_bytes_io_if_needed(
cast(Union[BinaryIO, SpooledTemporaryFile], file),
f = cast(
IO[bytes],
spooled_to_bytes_io_if_needed(
cast(Union[BinaryIO, SpooledTemporaryFile], file),
),
)
_, raw_text = read_txt_file(file=f)
elif text:
raw_text = text
return _get_leaf_elements(f, xml_path=xml_path)
else:
b = BytesIO(bytes(cast(str, text), encoding="utf-8"))
return _get_leaf_elements(b, xml_path=xml_path)


def _get_leaf_elements(
file: Union[str, IO[bytes]],
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Parse the XML tree in a memory efficient manner if possible."""
element_stack = []

element_iterator = etree.iterparse(file, events=("start", "end"))
# NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
# elements through in a memory efficient way, so we bite the bullet and load it all into
# memory.
if xml_path is not None:
_, element = next(element_iterator)
compiled_path = etree.XPath(xml_path)
element_iterator = (("end", el) for el in compiled_path(element))

for event, element in element_iterator:
if event == "start":
element_stack.append(element)

root = ET.fromstring(raw_text)
leaf_elements = []
if event == "end":
if element.text is not None and element.text.strip():
yield element.text

for elem in root.findall(xml_path):
for subelem in elem.iter():
if is_leaf(subelem) and is_string(subelem.text):
leaf_elements.append(subelem.text)
element.clear()

return leaf_elements
while element_stack and element_stack[-1].getparent() is None:
element_stack.pop()


@process_metadata()
Expand All @@ -65,7 +83,7 @@ def partition_xml(
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_keep_tags: bool = False,
xml_path: str = ".",
xml_path: Optional[str] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
encoding: Optional[str] = None,
Expand Down

0 comments on commit e5d0866

Please sign in to comment.