Skip to content

Commit

Permalink
enhancement: Add include_header kwarg for xlsx, default True (#1125)
Browse files Browse the repository at this point in the history
Closes GitHub issue #1121

Adds include_header kwarg to partition_xlsx and changes the default behavior to True.
  • Loading branch information
Coniferish authored Aug 17, 2023
1 parent 22c12ef commit 9f7bd61
Show file tree
Hide file tree
Showing 8 changed files with 67 additions and 39 deletions.
13 changes: 7 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
## 0.10.1-dev3
## 0.10.1

### Enhancements
* Bump unstructured-inference==0.5.12:
- fix to avoid trace for certain PDF's
* Bump unstructured-inference==0.5.11:
- better defaults for DPI for hi_res and Chipper
* Bump unstructured-inference==0.5.10:
- implement full-page OCR
- fix to avoid trace for certain PDF's (0.5.12)
- better defaults for DPI for hi_res and Chipper (0.5.11)
- implement full-page OCR (0.5.10)

### Features

### Fixes

* Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing)
* Update document dependencies to include tesseract-lang for additional language support (required for tests to pass)

## 0.10.0

### Enhancements

* Add `include_header` kwarg to `partition_xlsx` and change default behavior to `True`
* Update the `links` and `emphasized_texts` metadata fields

### Features
Expand All @@ -26,6 +26,7 @@

* fix pdf partition of list items being detected as titles in OCR only mode


## 0.9.3

### Enhancements
Expand Down
6 changes: 3 additions & 3 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.


def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition(filename=filename)
elements = partition(filename=filename, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand All @@ -681,7 +681,7 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x

def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition(file=f)
elements = partition(file=f, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand Down Expand Up @@ -774,7 +774,7 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"

@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"):
elements = partition(filename=filename)
elements = partition(filename=filename, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 3
Expand Down
38 changes: 31 additions & 7 deletions test_unstructured/partition/test_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename)
elements = partition_xlsx(filename=filename, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand All @@ -23,7 +23,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx")


def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"):
elements = partition_xlsx(filename=filename)
elements = partition_xlsx(filename=filename, include_header=False)
assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅"
Expand All @@ -32,16 +32,27 @@ def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xl
def test_partition_xlsx_from_filename_with_metadata_filename(
filename="example-docs/stanley-cups.xlsx",
):
elements = partition_xlsx(filename=filename, metadata_filename="test")
elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"


def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"):
    """Check that ``include_header=True`` keeps the header row when partitioning by filename.

    The workbook is expected to yield two ``Table`` elements. The first element's
    cleaned text must begin with the header cells, and its HTML rendering must
    contain a ``<thead>`` section.
    """
    tables = partition_xlsx(filename=filename, include_header=True)
    assert all(isinstance(table, Table) for table in tables)
    assert len(tables) == 2

    first_table = tables[0]
    # Header cells come first, followed by the same body text as the header-less case.
    expected_text = "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
    assert clean_extra_whitespace(first_table.text) == expected_text
    assert "<thead>" in first_table.metadata.text_as_html


def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f)
elements = partition_xlsx(file=f, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand All @@ -55,15 +66,28 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):

def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, metadata_filename="test")
elements = partition_xlsx(file=f, metadata_filename="test", include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
assert elements[0].metadata.filename == "test"


def test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"):
    """Check that ``include_header=True`` keeps the header row when partitioning a file object.

    The workbook is opened in binary mode and passed via ``file=``. Two ``Table``
    elements are expected. The first element's cleaned text must begin with the
    header cells, and its HTML rendering must contain a ``<thead>`` section.
    """
    with open(filename, "rb") as workbook:
        tables = partition_xlsx(file=workbook, include_header=True)

    assert all(isinstance(table, Table) for table in tables)
    assert len(tables) == 2

    first_table = tables[0]
    # Header cells come first, followed by the same body text as the header-less case.
    expected_text = "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT
    assert clean_extra_whitespace(first_table.text) == expected_text
    assert "<thead>" in first_table.metadata.text_as_html


def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
elements = partition_xlsx(filename=filename, include_metadata=False)
elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand All @@ -78,7 +102,7 @@ def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley

def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
with open(filename, "rb") as f:
elements = partition_xlsx(file=f, include_metadata=False)
elements = partition_xlsx(file=f, include_metadata=False, include_header=False)

assert all(isinstance(element, Table) for element in elements)
assert len(elements) == 2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"metadata": {
"data_source": {
"record_locator": {
Expand All @@ -16,13 +16,13 @@
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"element_id": "0699dddf33814117e04654068f5182f6",
"metadata": {
"data_source": {
"record_locator": {
Expand All @@ -37,8 +37,8 @@
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
[
{
"type": "Table",
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
"element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 1,
"page_name": "Stanley Cups",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
"text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
},
{
"type": "Table",
"element_id": "31421b5cd94fedb10dc82738503b4505",
"element_id": "0699dddf33814117e04654068f5182f6",
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"page_number": 2,
"page_name": "Stanley Cups Since 67",
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>Stanley Cups Since 67</th>\n <th>Unnamed: 1</th>\n <th>Unnamed: 2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
},
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
"text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
}
]
Loading

0 comments on commit 9f7bd61

Please sign in to comment.