Skip to content

Commit

Permalink
rfctr(csv): minify HTML and table text is cct
Browse files Browse the repository at this point in the history
  • Loading branch information
scanny committed Oct 18, 2024
1 parent c85f29e commit ab7e734
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 109 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.1-dev2
## 0.16.1-dev3

### Enhancements

Expand All @@ -11,6 +11,7 @@
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.16.0

Expand Down
126 changes: 29 additions & 97 deletions test_unstructured/partition/test_constants.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TABLE = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"</table>"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = (
"<table>"
"<tr><td>Year</td><td>Month</td><td>Revenue</td><td>Costs</td><td/></tr>"
"<tr><td>2022</td><td>1</td><td>123</td><td>-123</td><td/></tr>"
"<tr><td>2023</td><td>2</td><td>143,1</td><td>-814,38</td><td/></tr>"
"<tr><td>2024</td><td>3</td><td>215,32</td><td>-11,08</td><td/></tr>"
"</table>"
)

EXPECTED_TABLE_WITH_EMOJI = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"<tr><td>👨\\U+1F3FB🔧</td><td>TOR</td><td>15</td></tr>"
"</table>"
)

EXPECTED_TABLE_XLSX = (
"<table>"
Expand Down Expand Up @@ -54,74 +54,6 @@
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Year</td>
<td>Month</td>
<td>Revenue</td>
<td>Costs</td>
<td></td>
</tr>
<tr>
<td>2022</td>
<td>1</td>
<td>123</td>
<td>-123</td>
<td></td>
</tr>
<tr>
<td>2023</td>
<td>2</td>
<td>143,1</td>
<td>-814,38</td>
<td></td>
</tr>
<tr>
<td>2024</td>
<td>3</td>
<td>215,32</td>
<td>-11,08</td>
<td></td>
</tr>
</tbody>
</table>"""

EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""

EXPECTED_XLS_TABLE = (
"<table><tr>"
"<td>MC</td>"
Expand Down
5 changes: 1 addition & 4 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,8 @@ def test_partition_csv_header():
)

table = elements[0]
assert clean_extra_whitespace(table.text) == (
"Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<thead>" in table.metadata.text_as_html


# ================================================================================================
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.1-dev2" # pragma: no cover
__version__ = "0.16.1-dev3" # pragma: no cover
12 changes: 6 additions & 6 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from typing import IO, Any, Iterator

import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

from unstructured.chunking import add_chunking_strategy
from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
Expand Down Expand Up @@ -46,7 +46,6 @@ def partition_csv(
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
"""

ctx = _CsvPartitioningContext.load(
file_path=filename,
file=file,
Expand All @@ -58,17 +57,18 @@ def partition_csv(
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)

html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
html_table = HtmlTable.from_html_text(
dataframe.to_html(index=False, header=include_header, na_rep="")
)

metadata = ElementMetadata(
filename=filename,
last_modified=ctx.last_modified,
text_as_html=html_text if infer_table_structure else None,
text_as_html=html_table.html if infer_table_structure else None,
)

# -- a CSV file becomes a single `Table` element --
return [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
return [Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]


class _CsvPartitioningContext:
Expand Down

0 comments on commit ab7e734

Please sign in to comment.