From ab7e734f94b8e6b25360f2c816191b50f301664e Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Sun, 4 Aug 2024 14:33:51 -0700 Subject: [PATCH] rfctr(csv): minify HTML and table text is cct --- CHANGELOG.md | 3 +- test_unstructured/partition/test_constants.py | 126 ++++-------------- test_unstructured/partition/test_csv.py | 5 +- unstructured/__version__.py | 2 +- unstructured/partition/csv.py | 12 +- 5 files changed, 39 insertions(+), 109 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 020619e584..664879701f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.1-dev2 +## 0.16.1-dev3 ### Enhancements @@ -11,6 +11,7 @@ * **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. * **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file. * **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. +* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. 
## 0.16.0 diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py index 8b003cd9c5..a7b0ddce81 100644 --- a/test_unstructured/partition/test_constants.py +++ b/test_unstructured/partition/test_constants.py @@ -1,32 +1,32 @@ -EXPECTED_TABLE = """ - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stanley Cups
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" +EXPECTED_TABLE = ( + "" + "" + "" + "" + "" + "" + "
Stanley Cups
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" +) + +EXPECTED_TABLE_SEMICOLON_DELIMITER = ( + "" + "" + "" + "" + "" + "
YearMonthRevenueCosts
20221123-123
20232143,1-814,38
20243215,32-11,08
" +) + +EXPECTED_TABLE_WITH_EMOJI = ( + "" + "" + "" + "" + "" + "" + "" + "
Stanley Cups
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
👨\\U+1F3FB🔧TOR15
" +) EXPECTED_TABLE_XLSX = ( "" @@ -54,74 +54,6 @@ "Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08" ) -EXPECTED_TABLE_SEMICOLON_DELIMITER = """
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
YearMonthRevenueCosts
20221123-123
20232143,1-814,38
20243215,32-11,08
""" - -EXPECTED_TABLE_WITH_EMOJI = """ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Stanley Cups
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
👨\\U+1F3FB🔧TOR15
""" - EXPECTED_XLS_TABLE = ( "" "" diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py index 9317bec8cb..e3c34edbf5 100644 --- a/test_unstructured/partition/test_csv.py +++ b/test_unstructured/partition/test_csv.py @@ -200,11 +200,8 @@ def test_partition_csv_header(): ) table = elements[0] - assert clean_extra_whitespace(table.text) == ( - "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX - ) + assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX assert table.metadata.text_as_html is not None - assert "" in table.metadata.text_as_html # ================================================================================================ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a2a7ffee60..2a0c1ed30b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev2" # pragma: no cover +__version__ = "0.16.1-dev3" # pragma: no cover diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 6304712904..6006944c84 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -5,9 +5,9 @@ from typing import IO, Any, Iterator import pandas as pd -from lxml.html.soupparser import fromstring as soupparser_fromstring from unstructured.chunking import add_chunking_strategy +from unstructured.common.html_table import HtmlTable from unstructured.documents.elements import Element, ElementMetadata, Table from unstructured.file_utils.model import FileType from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date @@ -46,7 +46,6 @@ def partition_csv( Whether True or False, the "text" field is always present in any Table element and is the text content of the table (no structure). 
""" - ctx = _CsvPartitioningContext.load( file_path=filename, file=file, @@ -58,17 +57,18 @@ def partition_csv( with ctx.open() as file: dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding) - html_text = dataframe.to_html(index=False, header=include_header, na_rep="") - text = soupparser_fromstring(html_text).text_content() + html_table = HtmlTable.from_html_text( + dataframe.to_html(index=False, header=include_header, na_rep="") + ) metadata = ElementMetadata( filename=filename, last_modified=ctx.last_modified, - text_as_html=html_text if infer_table_structure else None, + text_as_html=html_table.html if infer_table_structure else None, ) # -- a CSV file becomes a single `Table` element -- - return [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)] + return [Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)] class _CsvPartitioningContext:
MC