diff --git a/CHANGELOG.md b/CHANGELOG.md
index 664879701f..19524fea38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.1-dev3
+## 0.16.1-dev4
### Enhancements
@@ -8,10 +8,11 @@
* **Remove unsupported chipper model**
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
-* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
+* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
-* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
-* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
+* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Minify text_as_html from PPTX.** Previously `.metadata.text_as_html` for PPTX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text and structure.
## 0.16.0
diff --git a/requirements/base.in b/requirements/base.in
index 6f3be98c91..f049ae493d 100644
--- a/requirements/base.in
+++ b/requirements/base.in
@@ -4,7 +4,6 @@ filetype
python-magic
lxml
nltk
-tabulate
requests
beautifulsoup4
emoji
diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py
index 8981ca19e1..441f0c51f9 100644
--- a/test_unstructured/partition/common/test_common.py
+++ b/test_unstructured/partition/common/test_common.py
@@ -388,17 +388,6 @@ def test_convert_office_docs_respects_wait_timeout():
assert np.sum([(path / "simple.docx").is_file() for path in paths_to_save]) < 3
-class MockDocxEmptyTable:
- def __init__(self):
- self.rows = []
-
-
-def test_convert_ms_office_table_to_text_works_with_empty_tables():
- table = MockDocxEmptyTable()
- assert common.convert_ms_office_table_to_text(table, as_html=True) == ""
- assert common.convert_ms_office_table_to_text(table, as_html=False) == ""
-
-
@pytest.mark.parametrize(
("text", "expected"),
[
diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py
index 43238a4136..e19d87ff98 100644
--- a/test_unstructured/partition/test_pptx.py
+++ b/test_unstructured/partition/test_pptx.py
@@ -247,15 +247,11 @@ def test_partition_pptx_grabs_tables():
assert elements[1].text.startswith("Column 1")
assert elements[1].text.strip().endswith("Aqua")
assert elements[1].metadata.text_as_html == (
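- "<table>\n"
- "<thead>\n"
- "<tr><th>Column 1  </th><th>Column 2  </th><th>Column 3  </th></tr>\n"
- "</thead>\n"
- "<tbody>\n"
- "<tr><td>Red       </td><td>Green     </td><td>Blue      </td></tr>\n"
- "<tr><td>Purple    </td><td>Orange    </td><td>Yellow    </td></tr>\n"
- "<tr><td>Tangerine </td><td>Pink      </td><td>Aqua      </td></tr>\n"
- "</tbody>\n"
+ "<table>"
+ "<tr><td>Column 1</td><td>Column 2</td><td>Column 3</td></tr>"
+ "<tr><td>Red</td><td>Green</td><td>Blue</td></tr>"
+ "<tr><td>Purple</td><td>Orange</td><td>Yellow</td></tr>"
+ "<tr><td>Tangerine</td><td>Pink</td><td>Aqua</td></tr>"
"</table>"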
)
assert elements[1].metadata.filename == "fake-power-point-table.pptx"
@@ -516,7 +512,7 @@ def test_partition_pptx_hierarchy_sample_document():
(2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"),
(0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"),
(None, None, "4120066d251ba675ade42e8a167ca61f"),
- (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"),
+ (None, None, "efb9d74b4f8be6308c9a9006da994e12"),
(0, None, "fd08cacbaddafee5cbacc02528536ee5"),
]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 2a0c1ed30b..46eb7507b5 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.1-dev3" # pragma: no cover
+__version__ = "0.16.1-dev4" # pragma: no cover
diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py
index ac83b1be7a..2605acb97e 100644
--- a/unstructured/partition/common/common.py
+++ b/unstructured/partition/common/common.py
@@ -9,7 +9,6 @@
import emoji
import psutil
-from tabulate import tabulate
from unstructured.documents.coordinates import CoordinateSystem, PixelSpace
from unstructured.documents.elements import (
@@ -29,9 +28,6 @@
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.utils import dependency_exists, first
-if dependency_exists("pptx") and dependency_exists("pptx.table"):
- from pptx.table import Table as PptxTable
-
if dependency_exists("numpy") and dependency_exists("cv2"):
from unstructured.partition.utils.sorting import sort_page_elements
@@ -396,27 +392,6 @@ def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
raise ValueError("Invalid file-like object type")
-def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str:
- """Convert a PPTX table object to an HTML table string using the tabulate library.
-
- Args:
- table (Table): A pptx.table.Table object.
- as_html (bool): Whether to return the table as an HTML string (True) or a
- plain text string (False)
-
- Returns:
- str: An table string representation of the input table.
- """
- rows = list(table.rows)
-
- if not rows:
- return ""
-
- headers = [cell.text for cell in rows[0].cells]
- data = [[cell.text for cell in row.cells] for row in rows[1:]]
- return tabulate(data, headers=headers, tablefmt="html" if as_html else "plain")
-
-
def contains_emoji(s: str) -> bool:
"""
Check if the input string contains any emoji characters.
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index 461c208366..0fc46c773e 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -22,6 +22,7 @@
from pptx.text.text import _Paragraph # pyright: ignore [reportPrivateUsage]
from unstructured.chunking import add_chunking_strategy
+from unstructured.common.html_table import HtmlTable, htmlify_matrix_of_cell_texts
from unstructured.documents.elements import (
Element,
ElementMetadata,
@@ -34,7 +35,6 @@
Title,
)
from unstructured.file_utils.model import FileType
-from unstructured.partition.common.common import convert_ms_office_table_to_text
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text_type import (
is_email_address,
@@ -213,38 +213,6 @@ def _iter_picture_elements(self, picture: Picture) -> Iterator[Element]:
PicturePartitionerCls = self._opts.picture_partitioner
yield from PicturePartitionerCls.iter_elements(picture, self._opts)
- def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
- """Generate Title element for each paragraph in title `shape`.
-
- Text is most likely a title, but in the rare case that the title shape was used
- for the slide body text, also check for bulleted paragraphs."""
- if self._shape_is_off_slide(shape):
- return
-
- depth = 0
- for paragraph in shape.text_frame.paragraphs:
- text = paragraph.text
- if text.strip() == "":
- continue
-
- if self._is_bulleted_paragraph(paragraph):
- bullet_depth = paragraph.level or 0
- yield ListItem(
- text=text,
- metadata=self._opts.text_metadata(category_depth=bullet_depth),
- detection_origin=DETECTION_ORIGIN,
- )
- elif is_email_address(text):
- yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN)
- else:
- # increment the category depth by the paragraph increment in the shape
- yield Title(
- text=text,
- metadata=self._opts.text_metadata(category_depth=depth),
- detection_origin=DETECTION_ORIGIN,
- )
- depth += 1 # Cannot enumerate because we want to skip empty paragraphs
-
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
"""Generate Text or subtype element for each paragraph in `shape`."""
if self._shape_is_off_slide(shape):
@@ -280,18 +248,55 @@ def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]:
An empty table does not produce an element.
"""
- text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip()
- if not text_table:
+ if not (rows := list(graphfrm.table.rows)):
+ return
+
+ html_text = htmlify_matrix_of_cell_texts(
+ [[cell.text for cell in row.cells] for row in rows]
+ )
+ html_table = HtmlTable.from_html_text(html_text)
+
+ if not html_table.text:
return
- html_table = None
- if self._opts.infer_table_structure:
- html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True)
- yield Table(
- text=text_table,
- metadata=self._opts.table_metadata(html_table),
- detection_origin=DETECTION_ORIGIN,
+
+ metadata = self._opts.table_metadata(
+ html_table.html if self._opts.infer_table_structure else None
)
+ yield Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
+
+ def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
+ """Generate Title element for each paragraph in title `shape`.
+
+ Text is most likely a title, but in the rare case that the title shape was used
+ for the slide body text, also check for bulleted paragraphs."""
+ if self._shape_is_off_slide(shape):
+ return
+
+ depth = 0
+ for paragraph in shape.text_frame.paragraphs:
+ text = paragraph.text
+ if text.strip() == "":
+ continue
+
+ if self._is_bulleted_paragraph(paragraph):
+ bullet_depth = paragraph.level or 0
+ yield ListItem(
+ text=text,
+ metadata=self._opts.text_metadata(category_depth=bullet_depth),
+ detection_origin=DETECTION_ORIGIN,
+ )
+ elif is_email_address(text):
+ yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN)
+ else:
+ # increment the category depth by the paragraph increment in the shape
+ yield Title(
+ text=text,
+ metadata=self._opts.text_metadata(category_depth=depth),
+ detection_origin=DETECTION_ORIGIN,
+ )
+ depth += 1 # Cannot enumerate because we want to skip empty paragraphs
+
def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.