diff --git a/CHANGELOG.md b/CHANGELOG.md
index 020619e584..664879701f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.1-dev2
+## 0.16.1-dev3
### Enhancements
@@ -11,6 +11,7 @@
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
+* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
## 0.16.0
diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py
index 8b003cd9c5..a7b0ddce81 100644
--- a/test_unstructured/partition/test_constants.py
+++ b/test_unstructured/partition/test_constants.py
@@ -1,32 +1,32 @@
-EXPECTED_TABLE = """
-
-
- Stanley Cups |
- |
- |
-
-
- Team |
- Location |
- Stanley Cups |
-
-
- Blues |
- STL |
- 1 |
-
-
- Flyers |
- PHI |
- 2 |
-
-
- Maple Leafs |
- TOR |
- 13 |
-
-
-
"""
+EXPECTED_TABLE = (
+ ""
+ "Stanley Cups | | |
"
+ "Team | Location | Stanley Cups |
"
+ "Blues | STL | 1 |
"
+ "Flyers | PHI | 2 |
"
+ "Maple Leafs | TOR | 13 |
"
+ "
"
+)
+
+EXPECTED_TABLE_SEMICOLON_DELIMITER = (
+ ""
+ "Year | Month | Revenue | Costs | |
"
+ "2022 | 1 | 123 | -123 | |
"
+ "2023 | 2 | 143,1 | -814,38 | |
"
+ "2024 | 3 | 215,32 | -11,08 | |
"
+ "
"
+)
+
+EXPECTED_TABLE_WITH_EMOJI = (
+ ""
+ "Stanley Cups | | |
"
+ "Team | Location | Stanley Cups |
"
+ "Blues | STL | 1 |
"
+ "Flyers | PHI | 2 |
"
+ "Maple Leafs | TOR | 13 |
"
+ "👨\\U+1F3FB🔧 | TOR | 15 |
"
+ "
"
+)
EXPECTED_TABLE_XLSX = (
""
@@ -54,74 +54,6 @@
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)
-EXPECTED_TABLE_SEMICOLON_DELIMITER = """
-
-
- Year |
- Month |
- Revenue |
- Costs |
- |
-
-
- 2022 |
- 1 |
- 123 |
- -123 |
- |
-
-
- 2023 |
- 2 |
- 143,1 |
- -814,38 |
- |
-
-
- 2024 |
- 3 |
- 215,32 |
- -11,08 |
- |
-
-
-
"""
-
-EXPECTED_TABLE_WITH_EMOJI = """
-
-
- Stanley Cups |
- |
- |
-
-
- Team |
- Location |
- Stanley Cups |
-
-
- Blues |
- STL |
- 1 |
-
-
- Flyers |
- PHI |
- 2 |
-
-
- Maple Leafs |
- TOR |
- 13 |
-
-
- 👨\\U+1F3FB🔧 |
- TOR |
- 15 |
-
-
-
"""
-
EXPECTED_XLS_TABLE = (
""
"MC | "
diff --git a/test_unstructured/partition/test_csv.py b/test_unstructured/partition/test_csv.py
index 9317bec8cb..e3c34edbf5 100644
--- a/test_unstructured/partition/test_csv.py
+++ b/test_unstructured/partition/test_csv.py
@@ -200,11 +200,8 @@ def test_partition_csv_header():
)
table = elements[0]
- assert clean_extra_whitespace(table.text) == (
- "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
- )
+ assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
- assert "" in table.metadata.text_as_html
# ================================================================================================
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a2a7ffee60..2a0c1ed30b 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.1-dev2" # pragma: no cover
+__version__ = "0.16.1-dev3" # pragma: no cover
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 6304712904..6006944c84 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -5,9 +5,9 @@
from typing import IO, Any, Iterator
import pandas as pd
-from lxml.html.soupparser import fromstring as soupparser_fromstring
from unstructured.chunking import add_chunking_strategy
+from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
@@ -46,7 +46,6 @@ def partition_csv(
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
"""
-
ctx = _CsvPartitioningContext.load(
file_path=filename,
file=file,
@@ -58,17 +57,18 @@ def partition_csv(
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
- html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
- text = soupparser_fromstring(html_text).text_content()
+ html_table = HtmlTable.from_html_text(
+ dataframe.to_html(index=False, header=include_header, na_rep="")
+ )
metadata = ElementMetadata(
filename=filename,
last_modified=ctx.last_modified,
- text_as_html=html_text if infer_table_structure else None,
+ text_as_html=html_table.html if infer_table_structure else None,
)
# -- a CSV file becomes a single `Table` element --
- return [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
+ return [Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
class _CsvPartitioningContext: