Skip to content

Commit

Permalink
enhancement: memory efficient xml partitioning (#1547)
Browse files Browse the repository at this point in the history
Closes #1236. Partitions XML documents iteratively in most cases*, never
loading the entire tree into memory. This ends up being much faster.

(* The exception is when the argument `xml_path` is passed to filter
elements. I was not able to find a way in Python to compare XPaths while
streaming the elements, aside from writing a custom XPath parser. So the
shortest way forward was to bite the bullet and load the whole tree in
memory when filtering by XPath.)

Memory usage is about 20% of usage on `main` when processing a 470MB XML
file. Time to process is 10s vs 900s.

Output is slightly different, but appears to be an improvement, adding
lines of text that were skipped by the previous partitioning. No text is lost.
  • Loading branch information
qued authored Sep 28, 2023
1 parent 62b0557 commit e5d0866
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 32 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.10.17-dev15
## 0.10.17-dev16

### Enhancements

* **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases.
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
* **Azure Cognitive Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index.
Expand Down
6 changes: 3 additions & 3 deletions test_unstructured/partition/test_xml_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_partition_xml_from_filename_with_metadata_filename():
)
def test_partition_xml_from_file(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path)

assert elements[0].text == "United States"
Expand All @@ -47,7 +47,7 @@ def test_partition_xml_from_file(filename):

def test_partition_xml_from_file_with_metadata_filename():
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml")
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test")

assert elements[0].text == "United States"
Expand Down Expand Up @@ -158,7 +158,7 @@ def test_partition_xml_from_filename_exclude_metadata(filename):
)
def test_partition_xml_from_file_exclude_metadata(filename):
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(file_path) as f:
with open(file_path, "rb") as f:
elements = partition_xml(
file=f,
xml_keep_tags=False,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.17-dev15" # pragma: no cover
__version__ = "0.10.17-dev16" # pragma: no cover
72 changes: 45 additions & 27 deletions unstructured/partition/xml.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import xml.etree.ElementTree as ET
from io import BytesIO
from tempfile import SpooledTemporaryFile
from typing import IO, BinaryIO, List, Optional, Union, cast
from typing import IO, BinaryIO, Iterator, List, Optional, Union, cast

from lxml import etree

from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import (
Expand All @@ -20,41 +22,57 @@
from unstructured.partition.text import element_from_text


def is_leaf(elem):
return not bool(elem)


def is_string(elem):
return isinstance(elem, str) or (hasattr(elem, "text") and isinstance(elem.text, str))


def get_leaf_elements(
filename: Optional[str] = None,
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_path: str = ".",
xml_keep_tags: bool = False,
) -> List[Optional[str]]:
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Get leaf elements from the XML tree defined in filename, file, or text."""
exactly_one(filename=filename, file=file, text=text)
if filename:
_, raw_text = read_txt_file(filename=filename)
return _get_leaf_elements(filename, xml_path=xml_path)
elif file:
f = spooled_to_bytes_io_if_needed(
cast(Union[BinaryIO, SpooledTemporaryFile], file),
f = cast(
IO[bytes],
spooled_to_bytes_io_if_needed(
cast(Union[BinaryIO, SpooledTemporaryFile], file),
),
)
_, raw_text = read_txt_file(file=f)
elif text:
raw_text = text
return _get_leaf_elements(f, xml_path=xml_path)
else:
b = BytesIO(bytes(cast(str, text), encoding="utf-8"))
return _get_leaf_elements(b, xml_path=xml_path)


def _get_leaf_elements(
file: Union[str, IO[bytes]],
xml_path: Optional[str] = None,
) -> Iterator[Optional[str]]:
"""Parse the XML tree in a memory efficient manner if possible."""
element_stack = []

element_iterator = etree.iterparse(file, events=("start", "end"))
# NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream
# elements through in a memory efficient way, so we bite the bullet and load it all into
# memory.
if xml_path is not None:
_, element = next(element_iterator)
compiled_path = etree.XPath(xml_path)
element_iterator = (("end", el) for el in compiled_path(element))

for event, element in element_iterator:
if event == "start":
element_stack.append(element)

root = ET.fromstring(raw_text)
leaf_elements = []
if event == "end":
if element.text is not None and element.text.strip():
yield element.text

for elem in root.findall(xml_path):
for subelem in elem.iter():
if is_leaf(subelem) and is_string(subelem.text):
leaf_elements.append(subelem.text)
element.clear()

return leaf_elements
while element_stack and element_stack[-1].getparent() is None:
element_stack.pop()


@process_metadata()
Expand All @@ -65,7 +83,7 @@ def partition_xml(
file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
text: Optional[str] = None,
xml_keep_tags: bool = False,
xml_path: str = ".",
xml_path: Optional[str] = None,
metadata_filename: Optional[str] = None,
include_metadata: bool = True,
encoding: Optional[str] = None,
Expand Down

0 comments on commit e5d0866

Please sign in to comment.