Skip to content

Commit

Permalink
Merge branch 'chore/bump-inference' of github.com:Unstructured-IO/uns…
Browse files Browse the repository at this point in the history
…tructured into chore/bump-inference
  • Loading branch information
badGarnet committed Oct 20, 2024
2 parents 4de9924 + 8875eb7 commit 319575d
Show file tree
Hide file tree
Showing 41 changed files with 1,000 additions and 955 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.1-dev2
## 0.16.1-dev3

### Enhancements

Expand All @@ -13,6 +13,7 @@
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.16.0

Expand Down
126 changes: 29 additions & 97 deletions test_unstructured/partition/test_constants.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TABLE = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"</table>"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = (
"<table>"
"<tr><td>Year</td><td>Month</td><td>Revenue</td><td>Costs</td><td/></tr>"
"<tr><td>2022</td><td>1</td><td>123</td><td>-123</td><td/></tr>"
"<tr><td>2023</td><td>2</td><td>143,1</td><td>-814,38</td><td/></tr>"
"<tr><td>2024</td><td>3</td><td>215,32</td><td>-11,08</td><td/></tr>"
"</table>"
)

EXPECTED_TABLE_WITH_EMOJI = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"<tr><td>👨\\U+1F3FB🔧</td><td>TOR</td><td>15</td></tr>"
"</table>"
)

EXPECTED_TABLE_XLSX = (
"<table>"
Expand Down Expand Up @@ -54,74 +54,6 @@
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Year</td>
<td>Month</td>
<td>Revenue</td>
<td>Costs</td>
<td></td>
</tr>
<tr>
<td>2022</td>
<td>1</td>
<td>123</td>
<td>-123</td>
<td></td>
</tr>
<tr>
<td>2023</td>
<td>2</td>
<td>143,1</td>
<td>-814,38</td>
<td></td>
</tr>
<tr>
<td>2024</td>
<td>3</td>
<td>215,32</td>
<td>-11,08</td>
<td></td>
</tr>
</tbody>
</table>"""

EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""

EXPECTED_XLS_TABLE = (
"<table><tr>"
"<td>MC</td>"
Expand Down
5 changes: 1 addition & 4 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,8 @@ def test_partition_csv_header():
)

table = elements[0]
assert clean_extra_whitespace(table.text) == (
"Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<thead>" in table.metadata.text_as_html


# ================================================================================================
Expand Down
43 changes: 19 additions & 24 deletions test_unstructured/partition/test_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv

Expand All @@ -31,21 +30,20 @@
def test_partition_tsv_from_filename(filename: str, expected_text: str, expected_table: str):
elements = partition_tsv(example_doc_path(filename), include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename == filename
table = elements[0]
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename == filename for e in elements)


def test_partition_tsv_from_filename_with_metadata_filename():
elements = partition_tsv(
example_doc_path("stanley-cups.tsv"), metadata_filename="test", include_header=False
)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(e.metadata.filename == "test" for e in elements)


@pytest.mark.parametrize(
Expand All @@ -59,21 +57,20 @@ def test_partition_tsv_from_file(filename: str, expected_text: str, expected_tab
with open(example_doc_path(filename), "rb") as f:
elements = partition_tsv(file=f, include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename is None
table = elements[0]
assert isinstance(table, Table)
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename is None for e in elements)


def test_partition_tsv_from_file_with_metadata_filename():
with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
elements = partition_tsv(file=f, metadata_filename="test", include_header=False)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(element.metadata.filename == "test" for element in elements)


# -- .metadata.last_modified ---------------------------------------------------------------------
Expand Down Expand Up @@ -142,12 +139,10 @@ def test_partition_tsv_header():
example_doc_path("stanley-cups.tsv"), strategy="fast", include_header=True
)

e = elements[0]
assert (
clean_extra_whitespace(e.text) == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert e.metadata.text_as_html is not None
assert "<thead>" in e.metadata.text_as_html
table = elements[0]
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<table>" in table.metadata.text_as_html


def test_partition_tsv_supports_chunking_strategy_while_partitioning():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,20 @@
"type": "ListItem"
},
{
"element_id": "6277cd91869e10d6256f362b08d3e789",
"element_id": "f0f0586caeb3af4284c1b367a5269d27",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "452",
"type": "Header"
},
{
"element_id": "ac79570be092923eb29899f64281c3b3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -351,7 +364,7 @@
"type": "Table"
},
{
"element_id": "22b8448fe36b3ccd06d1d8e4ea2dc1ea",
"element_id": "13fd694e1ff862d163b840a246964e58",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -364,7 +377,7 @@
"type": "Title"
},
{
"element_id": "f2b57562924402b85f6eb07925ea1654",
"element_id": "5f1c4074c1b5d641b724b99be6f5ddfd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -377,7 +390,7 @@
"type": "NarrativeText"
},
{
"element_id": "d9f6efffd49ef59e671206bfb5f094de",
"element_id": "afed004de4c50d761640b6c18729a988",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -390,7 +403,7 @@
"type": "ListItem"
},
{
"element_id": "2a1e46bc589c5eca777b657e141e824b",
"element_id": "f93d89ccb971e2b60f44afbf710673c6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -403,7 +416,7 @@
"type": "NarrativeText"
},
{
"element_id": "2c42182c07ecdb96362b534a8fad4d59",
"element_id": "cb6e8acb9c24820b59f8973cc236ef35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -416,7 +429,7 @@
"type": "ListItem"
},
{
"element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f",
"element_id": "5964ede27be8850de7a13e0dd32c1b21",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -429,7 +442,7 @@
"type": "NarrativeText"
},
{
"element_id": "07cdb1623f501ea23a343039300178cc",
"element_id": "e1f7e635d8739a97d8d0000ba8004f61",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -442,7 +455,7 @@
"type": "ListItem"
},
{
"element_id": "4bf8165bcb21c5296b741ba0f9e38f93",
"element_id": "deb8964830ba1f9dd1eec7b08bd3ea19",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -455,7 +468,7 @@
"type": "Title"
},
{
"element_id": "85918ce2a03e9f236137a0fe72985af0",
"element_id": "be270e13c935334fa3b17b13066d639b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -468,7 +481,7 @@
"type": "NarrativeText"
},
{
"element_id": "93537983496efa695cfc65ad895d9412",
"element_id": "5c97405ec921495b23d2b400516cbd06",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand All @@ -481,7 +494,7 @@
"type": "Image"
},
{
"element_id": "76b94e78b638b79374e266284c1a0d83",
"element_id": "7956ee39ac5e080a362967e2f6a5753e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
Expand Down
Loading

0 comments on commit 319575d

Please sign in to comment.