diff --git a/CHANGELOG.md b/CHANGELOG.md index 664879701f..19524fea38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.1-dev3 +## 0.16.1-dev4 ### Enhancements @@ -8,10 +8,11 @@ * **Remove unsupported chipper model** * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process. -* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. +* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. * **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file. -* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. -* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. 
+* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. +* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. +* **Minify text_as_html from PPTX.** Previously `.metadata.text_as_html` for PPTX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text and structure. ## 0.16.0 diff --git a/requirements/base.in b/requirements/base.in index 6f3be98c91..f049ae493d 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -4,7 +4,6 @@ filetype python-magic lxml nltk -tabulate requests beautifulsoup4 emoji diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py index 8981ca19e1..441f0c51f9 100644 --- a/test_unstructured/partition/common/test_common.py +++ b/test_unstructured/partition/common/test_common.py @@ -388,17 +388,6 @@ def test_convert_office_docs_respects_wait_timeout(): assert np.sum([(path / "simple.docx").is_file() for path in paths_to_save]) < 3 -class MockDocxEmptyTable: - def __init__(self): - self.rows = [] - - -def test_convert_ms_office_table_to_text_works_with_empty_tables(): - table = MockDocxEmptyTable() - assert common.convert_ms_office_table_to_text(table, as_html=True) == "" - assert common.convert_ms_office_table_to_text(table, as_html=False) == "" - - @pytest.mark.parametrize( ("text", "expected"), [ diff --git a/test_unstructured/partition/test_pptx.py 
b/test_unstructured/partition/test_pptx.py index 43238a4136..e19d87ff98 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -247,15 +247,11 @@ def test_partition_pptx_grabs_tables(): assert elements[1].text.startswith("Column 1") assert elements[1].text.strip().endswith("Aqua") assert elements[1].metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" + "
Column 1 Column 2 Column 3
Red Green Blue
Purple Orange Yellow
Tangerine Pink Aqua
" + "" + "" + "" + "" "
Column 1Column 2Column 3
RedGreenBlue
PurpleOrangeYellow
TangerinePinkAqua
" ) assert elements[1].metadata.filename == "fake-power-point-table.pptx" @@ -516,7 +512,7 @@ def test_partition_pptx_hierarchy_sample_document(): (2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"), (0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"), (None, None, "4120066d251ba675ade42e8a167ca61f"), - (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"), + (None, None, "efb9d74b4f8be6308c9a9006da994e12"), (0, None, "fd08cacbaddafee5cbacc02528536ee5"), ] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2a0c1ed30b..46eb7507b5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev3" # pragma: no cover +__version__ = "0.16.1-dev4" # pragma: no cover diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py index ac83b1be7a..2605acb97e 100644 --- a/unstructured/partition/common/common.py +++ b/unstructured/partition/common/common.py @@ -9,7 +9,6 @@ import emoji import psutil -from tabulate import tabulate from unstructured.documents.coordinates import CoordinateSystem, PixelSpace from unstructured.documents.elements import ( @@ -29,9 +28,6 @@ from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT from unstructured.utils import dependency_exists, first -if dependency_exists("pptx") and dependency_exists("pptx.table"): - from pptx.table import Table as PptxTable - if dependency_exists("numpy") and dependency_exists("cv2"): from unstructured.partition.utils.sorting import sort_page_elements @@ -396,27 +392,6 @@ def convert_to_bytes(file: bytes | IO[bytes]) -> bytes: raise ValueError("Invalid file-like object type") -def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str: - """Convert a PPTX table object to an HTML table string using the tabulate library. - - Args: - table (Table): A pptx.table.Table object. 
- as_html (bool): Whether to return the table as an HTML string (True) or a - plain text string (False) - - Returns: - str: An table string representation of the input table. - """ - rows = list(table.rows) - - if not rows: - return "" - - headers = [cell.text for cell in rows[0].cells] - data = [[cell.text for cell in row.cells] for row in rows[1:]] - return tabulate(data, headers=headers, tablefmt="html" if as_html else "plain") - - def contains_emoji(s: str) -> bool: """ Check if the input string contains any emoji characters. diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 461c208366..0fc46c773e 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -22,6 +22,7 @@ from pptx.text.text import _Paragraph # pyright: ignore [reportPrivateUsage] from unstructured.chunking import add_chunking_strategy +from unstructured.common.html_table import HtmlTable, htmlify_matrix_of_cell_texts from unstructured.documents.elements import ( Element, ElementMetadata, @@ -34,7 +35,6 @@ Title, ) from unstructured.file_utils.model import FileType -from unstructured.partition.common.common import convert_ms_office_table_to_text from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date from unstructured.partition.text_type import ( is_email_address, @@ -213,38 +213,6 @@ def _iter_picture_elements(self, picture: Picture) -> Iterator[Element]: PicturePartitionerCls = self._opts.picture_partitioner yield from PicturePartitionerCls.iter_elements(picture, self._opts) - def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]: - """Generate Title element for each paragraph in title `shape`. 
- - Text is most likely a title, but in the rare case that the title shape was used - for the slide body text, also check for bulleted paragraphs.""" - if self._shape_is_off_slide(shape): - return - - depth = 0 - for paragraph in shape.text_frame.paragraphs: - text = paragraph.text - if text.strip() == "": - continue - - if self._is_bulleted_paragraph(paragraph): - bullet_depth = paragraph.level or 0 - yield ListItem( - text=text, - metadata=self._opts.text_metadata(category_depth=bullet_depth), - detection_origin=DETECTION_ORIGIN, - ) - elif is_email_address(text): - yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN) - else: - # increment the category depth by the paragraph increment in the shape - yield Title( - text=text, - metadata=self._opts.text_metadata(category_depth=depth), - detection_origin=DETECTION_ORIGIN, - ) - depth += 1 # Cannot enumerate because we want to skip empty paragraphs - def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]: """Generate Text or subtype element for each paragraph in `shape`.""" if self._shape_is_off_slide(shape): @@ -280,18 +248,55 @@ def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]: An empty table does not produce an element. 
""" - text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip() - if not text_table: + if not (rows := list(graphfrm.table.rows)): + return + + html_text = htmlify_matrix_of_cell_texts( + [[cell.text for cell in row.cells] for row in rows] + ) + html_table = HtmlTable.from_html_text(html_text) + + if not html_table.text: return - html_table = None - if self._opts.infer_table_structure: - html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True) - yield Table( - text=text_table, - metadata=self._opts.table_metadata(html_table), - detection_origin=DETECTION_ORIGIN, + + metadata = self._opts.table_metadata( + html_table.html if self._opts.infer_table_structure else None ) + yield Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN) + + def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]: + """Generate Title element for each paragraph in title `shape`. + + Text is most likely a title, but in the rare case that the title shape was used + for the slide body text, also check for bulleted paragraphs.""" + if self._shape_is_off_slide(shape): + return + + depth = 0 + for paragraph in shape.text_frame.paragraphs: + text = paragraph.text + if text.strip() == "": + continue + + if self._is_bulleted_paragraph(paragraph): + bullet_depth = paragraph.level or 0 + yield ListItem( + text=text, + metadata=self._opts.text_metadata(category_depth=bullet_depth), + detection_origin=DETECTION_ORIGIN, + ) + elif is_email_address(text): + yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN) + else: + # increment the category depth by the paragraph increment in the shape + yield Title( + text=text, + metadata=self._opts.text_metadata(category_depth=depth), + detection_origin=DETECTION_ORIGIN, + ) + depth += 1 # Cannot enumerate because we want to skip empty paragraphs + def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]: """Orders the shapes on `slide` from top 
to bottom and left to right.