diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e6f99e968..60574a7723 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,15 @@ -## 0.10.1-dev3 +## 0.10.1 ### Enhancements * Bump unstructured-inference==0.5.12: - - fix to avoid trace for certain PDF's -* Bump unstructured-inference==0.5.11: - - better defaults for DPI for hi_res and Chipper -* Bump unstructured-inference==0.5.10: - - implement full-page OCR + - fix to avoid trace for certain PDF's (0.5.12) + - better defaults for DPI for hi_res and Chipper (0.5.11) + - implement full-page OCR (0.5.10) ### Features ### Fixes + * Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing) * Update document dependencies to include tesseract-lang for additional language support (required for tests to pass) @@ -18,6 +17,7 @@ ### Enhancements +* Add `include_header` kwarg to `partition_xlsx` and change default behavior to `True` * Update the `links` and `emphasized_texts` metadata fields ### Features @@ -26,6 +26,7 @@ * fix pdf partition of list items being detected as titles in OCR only mode + ## 0.9.3 ### Enhancements diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index e7af5f35a9..4389e71522 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -668,7 +668,7 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook. 
def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"): - elements = partition(filename=filename) + elements = partition(filename=filename, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 @@ -681,7 +681,7 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: - elements = partition(file=f) + elements = partition(file=f, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 @@ -774,7 +774,7 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx" @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") def test_auto_partition_xls_from_filename(filename="example-docs/tests-example.xls"): - elements = partition(filename=filename) + elements = partition(filename=filename, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 3 diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index 291b097493..56f539b28a 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -9,7 +9,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"): - elements = partition_xlsx(filename=filename) + elements = partition_xlsx(filename=filename, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 @@ -23,7 +23,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx") def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xlsx"): - elements = partition_xlsx(filename=filename) + elements = partition_xlsx(filename=filename, include_header=False) assert all(isinstance(element, 
Table) for element in elements) assert len(elements) == 1 assert clean_extra_whitespace(elements[0].text) == "🤠😅" @@ -32,16 +32,27 @@ def test_partition_xlsx_from_filename_with_emoji(filename="example-docs/emoji.xl def test_partition_xlsx_from_filename_with_metadata_filename( filename="example-docs/stanley-cups.xlsx", ): - elements = partition_xlsx(filename=filename, metadata_filename="test") + elements = partition_xlsx(filename=filename, metadata_filename="test", include_header=False) assert all(isinstance(element, Table) for element in elements) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert elements[0].metadata.filename == "test" +def test_partition_xlsx_from_filename_with_header(filename="example-docs/stanley-cups.xlsx"): + elements = partition_xlsx(filename=filename, include_header=True) + assert all(isinstance(element, Table) for element in elements) + assert len(elements) == 2 + assert ( + clean_extra_whitespace(elements[0].text) + == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT + ) + assert "" in elements[0].metadata.text_as_html + + def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: - elements = partition_xlsx(file=f) + elements = partition_xlsx(file=f, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 @@ -55,15 +66,28 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"): def test_partition_xlsx_from_file_with_metadata_filename(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: - elements = partition_xlsx(file=f, metadata_filename="test") + elements = partition_xlsx(file=f, metadata_filename="test", include_header=False) assert all(isinstance(element, Table) for element in elements) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert elements[0].metadata.filename == "test" +def 
test_partition_xlsx_from_file_with_header(filename="example-docs/stanley-cups.xlsx"): + with open(filename, "rb") as f: + elements = partition_xlsx(file=f, include_header=True) + + assert all(isinstance(element, Table) for element in elements) + assert len(elements) == 2 + assert ( + clean_extra_whitespace(elements[0].text) + == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT + ) + assert "" in elements[0].metadata.text_as_html + + def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley-cups.xlsx"): - elements = partition_xlsx(filename=filename, include_metadata=False) + elements = partition_xlsx(filename=filename, include_metadata=False, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 @@ -78,7 +102,7 @@ def test_partition_xlsx_filename_exclude_metadata(filename="example-docs/stanley def test_partition_xlsx_from_file_exclude_metadata(filename="example-docs/stanley-cups.xlsx"): with open(filename, "rb") as f: - elements = partition_xlsx(file=f, include_metadata=False) + elements = partition_xlsx(file=f, include_metadata=False, include_header=False) assert all(isinstance(element, Table) for element in elements) assert len(elements) == 2 diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json index 479f519baa..5faceb9efb 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/stanley-cups.json @@ -1,7 +1,7 @@ [ { "type": "Table", - "element_id": "c00fc0e5ac303c40f9089791e5e485b1", + "element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6", "metadata": { "data_source": { "record_locator": { @@ -16,13 +16,13 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 
"page_number": 1, "page_name": "Stanley Cups", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley CupsUnnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" }, - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + "text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" }, { "type": "Table", - "element_id": "31421b5cd94fedb10dc82738503b4505", + "element_id": "0699dddf33814117e04654068f5182f6", "metadata": { "data_source": { "record_locator": { @@ -37,8 +37,8 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "page_number": 2, "page_name": "Stanley Cups Since 67", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley Cups Since 67Unnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + "text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json index 4b327de5f7..2213a49c9c 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -1,26 +1,26 @@ [ { "type": "Table", - "element_id": "c00fc0e5ac303c40f9089791e5e485b1", + "element_id": "3e65b02bec20bb1056bd23a3b4ecd0f6", "metadata": { "data_source": {}, "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "page_number": 1, "page_name": "Stanley Cups", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley CupsUnnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" }, - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + "text": "\n\n\nStanley Cups\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" }, { "type": "Table", - "element_id": "31421b5cd94fedb10dc82738503b4505", + "element_id": "0699dddf33814117e04654068f5182f6", "metadata": { "data_source": {}, "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "page_number": 2, "page_name": "Stanley Cups Since 67", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Stanley Cups Since 67Unnamed: 1Unnamed: 2
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
" }, - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + "text": "\n\n\nStanley Cups Since 67\nUnnamed: 1\nUnnamed: 2\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json index 407253e220..1b7ea1fc78 100644 --- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json +++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.json @@ -1,41 +1,41 @@ [ { "type": "Table", - "element_id": "677f7fdbfa79de9d91e157663dd559cd", + "element_id": "0e2d044a26942328e2b8647574232e7f", "metadata": { "data_source": {}, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", "page_number": 1, "page_name": "Example Test", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MCWhat is 2+2?4correct3incorrectUnnamed: 6Unnamed: 7Unnamed: 8
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" }, - "text": "\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n" + "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\nUnnamed: 6\nUnnamed: 7\nUnnamed: 8\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n" }, { "type": "Table", - "element_id": "079ef3ee8c03cb36789b08765181ebc4", + "element_id": "5c56dd4c5b649b873ebd848312e66753", "metadata": { "data_source": {}, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", "page_number": 2, "page_name": "Format Abbr.", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
http://www.cmu.edu/blackboard
Question Format Abbreviations
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0Unnamed: 1
http://www.cmu.edu/blackboard
Question Format Abbreviations
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
" }, - "text": "\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n" + "text": "\n\n\nUnnamed: 0\nUnnamed: 1\n\n\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n" }, { "type": "Table", - "element_id": "c7b7d8780a970d589554c3784283b67e", + "element_id": "f48657c4eb70d98975e567248d0ef4bb", "metadata": { "data_source": {}, "filename": "tests-example.xls", "filetype": "application/vnd.ms-excel", "page_number": 3, "page_name": "Readme", - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
http://www.cmu.edu/blackboard
File Information
Source
http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls
Version
1.0 (January 2012)
Contact
bb-help@andrew.cmu.edu
About
This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions
" + "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0
http://www.cmu.edu/blackboard
File Information
Source
http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls
Version
1.0 (January 2012)
Contact
bb-help@andrew.cmu.edu
About
This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions
" }, - "text": "\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n" + "text": "\n\n\nUnnamed: 0\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n" } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 21690737da..b1fe0aa5f6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.1-dev3" # pragma: no cover +__version__ = "0.10.1" # pragma: no cover diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index fe21d8183e..2f4538210f 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -27,6 +27,7 @@ def partition_xlsx( metadata_filename: Optional[str] = None, include_metadata: bool = True, metadata_last_modified: Optional[str] = None, + include_header: bool = True, **kwargs, ) -> List[Element]: """Partitions Microsoft Excel Documents in .xlsx format into its document elements. @@ -41,6 +42,8 @@ def partition_xlsx( Determines whether or not metadata is included in the output. 
metadata_last_modified The day of the last modification + include_header + Determines whether or not header info is included in text and metadata.text_as_html """ exactly_one(filename=filename, file=file) last_modification_date = None @@ -59,7 +62,7 @@ def partition_xlsx( page_number = 0 for sheet_name, table in sheets.items(): page_number += 1 - html_text = table.to_html(index=False, header=False, na_rep="") + html_text = table.to_html(index=False, header=include_header, na_rep="") text = soupparser_fromstring(html_text).text_content() if include_metadata: