Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rfctr(csv): minify HTML and table text is cct #3733

Merged
merged 3 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.1-dev2
## 0.16.1-dev3

### Enhancements

Expand All @@ -11,6 +11,7 @@
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.16.0

Expand Down
126 changes: 29 additions & 97 deletions test_unstructured/partition/test_constants.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TABLE = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"</table>"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = (
"<table>"
"<tr><td>Year</td><td>Month</td><td>Revenue</td><td>Costs</td><td/></tr>"
"<tr><td>2022</td><td>1</td><td>123</td><td>-123</td><td/></tr>"
"<tr><td>2023</td><td>2</td><td>143,1</td><td>-814,38</td><td/></tr>"
"<tr><td>2024</td><td>3</td><td>215,32</td><td>-11,08</td><td/></tr>"
"</table>"
)

EXPECTED_TABLE_WITH_EMOJI = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"<tr><td>👨\\U+1F3FB🔧</td><td>TOR</td><td>15</td></tr>"
"</table>"
)

EXPECTED_TABLE_XLSX = (
"<table>"
Expand Down Expand Up @@ -54,74 +54,6 @@
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Year</td>
<td>Month</td>
<td>Revenue</td>
<td>Costs</td>
<td></td>
</tr>
<tr>
<td>2022</td>
<td>1</td>
<td>123</td>
<td>-123</td>
<td></td>
</tr>
<tr>
<td>2023</td>
<td>2</td>
<td>143,1</td>
<td>-814,38</td>
<td></td>
</tr>
<tr>
<td>2024</td>
<td>3</td>
<td>215,32</td>
<td>-11,08</td>
<td></td>
</tr>
</tbody>
</table>"""

EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""

EXPECTED_XLS_TABLE = (
"<table><tr>"
"<td>MC</td>"
Expand Down
5 changes: 1 addition & 4 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,8 @@ def test_partition_csv_header():
)

table = elements[0]
assert clean_extra_whitespace(table.text) == (
"Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<thead>" in table.metadata.text_as_html


# ================================================================================================
Expand Down
43 changes: 19 additions & 24 deletions test_unstructured/partition/test_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv

Expand All @@ -31,21 +30,20 @@
def test_partition_tsv_from_filename(filename: str, expected_text: str, expected_table: str):
elements = partition_tsv(example_doc_path(filename), include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename == filename
table = elements[0]
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename == filename for e in elements)


def test_partition_tsv_from_filename_with_metadata_filename():
elements = partition_tsv(
example_doc_path("stanley-cups.tsv"), metadata_filename="test", include_header=False
)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(e.metadata.filename == "test" for e in elements)


@pytest.mark.parametrize(
Expand All @@ -59,21 +57,20 @@ def test_partition_tsv_from_file(filename: str, expected_text: str, expected_tab
with open(example_doc_path(filename), "rb") as f:
elements = partition_tsv(file=f, include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename is None
table = elements[0]
assert isinstance(table, Table)
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename is None for e in elements)


def test_partition_tsv_from_file_with_metadata_filename():
with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
elements = partition_tsv(file=f, metadata_filename="test", include_header=False)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(element.metadata.filename == "test" for element in elements)


# -- .metadata.last_modified ---------------------------------------------------------------------
Expand Down Expand Up @@ -142,12 +139,10 @@ def test_partition_tsv_header():
example_doc_path("stanley-cups.tsv"), strategy="fast", include_header=True
)

e = elements[0]
assert (
clean_extra_whitespace(e.text) == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert e.metadata.text_as_html is not None
assert "<thead>" in e.metadata.text_as_html
table = elements[0]
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<table>" in table.metadata.text_as_html


def test_partition_tsv_supports_chunking_strategy_while_partitioning():
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Loading
Loading