diff --git a/CHANGELOG.md b/CHANGELOG.md index 98311ce131..8367477168 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.10.17-dev15 +## 0.10.17-dev16 ### Enhancements +* **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases. * **Adds data source properties to SharePoint, Outlook, OneDrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." * **Azure Cognitive Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index. 
diff --git a/test_unstructured/partition/test_xml_partition.py b/test_unstructured/partition/test_xml_partition.py index cebff68b64..a301a46870 100644 --- a/test_unstructured/partition/test_xml_partition.py +++ b/test_unstructured/partition/test_xml_partition.py @@ -38,7 +38,7 @@ def test_partition_xml_from_filename_with_metadata_filename(): ) def test_partition_xml_from_file(filename): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename=file_path) assert elements[0].text == "United States" @@ -47,7 +47,7 @@ def test_partition_xml_from_file(filename): def test_partition_xml_from_file_with_metadata_filename(): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "factbook.xml") - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml(file=f, xml_keep_tags=False, metadata_filename="test") assert elements[0].text == "United States" @@ -158,7 +158,7 @@ def test_partition_xml_from_filename_exclude_metadata(filename): ) def test_partition_xml_from_file_exclude_metadata(filename): file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename) - with open(file_path) as f: + with open(file_path, "rb") as f: elements = partition_xml( file=f, xml_keep_tags=False, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 01785404e1..c555eeb001 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev15" # pragma: no cover +__version__ = "0.10.17-dev16" # pragma: no cover diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index ff733300c5..f8bdc48211 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -1,6 +1,8 @@ -import xml.etree.ElementTree as ET +from io import BytesIO from tempfile import SpooledTemporaryFile -from typing import IO, 
BinaryIO, List, Optional, Union, cast +from typing import IO, BinaryIO, Iterator, List, Optional, Union, cast + +from lxml import etree from unstructured.chunking.title import add_chunking_strategy from unstructured.documents.elements import ( @@ -20,41 +22,57 @@ from unstructured.partition.text import element_from_text -def is_leaf(elem): - return not bool(elem) - - -def is_string(elem): - return isinstance(elem, str) or (hasattr(elem, "text") and isinstance(elem.text, str)) - - def get_leaf_elements( filename: Optional[str] = None, file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, text: Optional[str] = None, - xml_path: str = ".", - xml_keep_tags: bool = False, -) -> List[Optional[str]]: + xml_path: Optional[str] = None, +) -> Iterator[Optional[str]]: + """Get leaf elements from the XML tree defined in filename, file, or text.""" exactly_one(filename=filename, file=file, text=text) if filename: - _, raw_text = read_txt_file(filename=filename) + return _get_leaf_elements(filename, xml_path=xml_path) elif file: - f = spooled_to_bytes_io_if_needed( - cast(Union[BinaryIO, SpooledTemporaryFile], file), + f = cast( + IO[bytes], + spooled_to_bytes_io_if_needed( + cast(Union[BinaryIO, SpooledTemporaryFile], file), + ), ) - _, raw_text = read_txt_file(file=f) - elif text: - raw_text = text + return _get_leaf_elements(f, xml_path=xml_path) + else: + b = BytesIO(bytes(cast(str, text), encoding="utf-8")) + return _get_leaf_elements(b, xml_path=xml_path) + + +def _get_leaf_elements( + file: Union[str, IO[bytes]], + xml_path: Optional[str] = None, +) -> Iterator[Optional[str]]: + """Parse the XML tree in a memory efficient manner if possible.""" + element_stack = [] + + element_iterator = etree.iterparse(file, events=("start", "end")) + # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream + # elements through in a memory efficient way, so we bite the bullet and load it all into + # memory. 
+ if xml_path is not None: + _, element = next(element_iterator) + compiled_path = etree.XPath(xml_path) + element_iterator = (("end", el) for el in compiled_path(element)) + + for event, element in element_iterator: + if event == "start": + element_stack.append(element) - root = ET.fromstring(raw_text) - leaf_elements = [] + if event == "end": + if element.text is not None and element.text.strip(): + yield element.text - for elem in root.findall(xml_path): - for subelem in elem.iter(): - if is_leaf(subelem) and is_string(subelem.text): - leaf_elements.append(subelem.text) + element.clear() - return leaf_elements + while element_stack and element_stack[-1].getparent() is None: + element_stack.pop() @process_metadata() @@ -65,7 +83,7 @@ def partition_xml( file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None, text: Optional[str] = None, xml_keep_tags: bool = False, - xml_path: str = ".", + xml_path: Optional[str] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, encoding: Optional[str] = None,