From 22974f61ceb5172ccba6c2d885a1d1c01ef93d08 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 7 Sep 2023 09:16:31 -0400 Subject: [PATCH] fix: separate elements by `<br>` tag in `partition_html` (#1314) ### Summary Closes #1230. Updates `partition_html` to split on `<br>` tags that appear within text elements. ### Testing The following code previously produced one giant element on `main`. ```python from unstructured.partition.html import partition_html filename = "example-docs/ideas-page.html" elements = partition_html(filename=filename) len(elements) # Should be 4 print("\n\n".join([str(el) for el in elements])) ``` The output should be: ```python January 2023 (Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. 
``` --- CHANGELOG.md | 5 +- .../partition/test_html_partition.py | 26 ++++- .../Shared Documents/ideas-page.json | 39 ++++++- .../azure/spring-weather.html.json | 104 ++++++++++++++++-- .../box/nested-1/ideas-page.html.json | 36 +++++- .../box/nested-2/ideas-page.html.json | 36 +++++- .../confluence-diff/MFS/1540126.json | 40 ------- .../confluence-diff/MFS/1605928.json | 10 -- .../dropbox/nested-1/ideas-page.html.json | 36 +++++- .../dropbox/nested-2/ideas-page.html.json | 36 +++++- .../gcs/ideas-page.html.json | 36 +++++- .../gcs/nested-1/nested/ideas-page.html.json | 36 +++++- .../gcs/nested-2/nested/ideas-page.html.json | 36 +++++- .../github/test.html.json | 14 ++- unstructured/__version__.py | 2 +- unstructured/documents/html.py | 42 ++++++- 16 files changed, 426 insertions(+), 108 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a349c18a0b..dd43366509 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.13-dev2 +## 0.10.13-dev3 ### Enhancements @@ -10,6 +10,7 @@ ### Fixes +* `partition_html` breaks on `<br>` elements. * Ingest error handling to properly raise errors when wrapped ## 0.10.12 @@ -31,7 +32,7 @@ * Bump unstructured-inference * Avoid divide-by-zero errors swith `safe_division` (0.5.21) - + ## 0.10.11 ### Enhancements diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 8c725e427c..27172d228f 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -265,12 +265,28 @@ def test_partition_html_raises_with_too_many_specified(): partition_html(filename=filename, text=text) -def test_partition_html_on_ideas_page(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html") +def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"): elements = partition_html(filename=filename) - document_text = "\n\n".join([str(el) for el in elements]) - assert document_text.startswith("January 2023(Someone fed my essays into GPT") - assert document_text.endswith("whole new fractal buds.") + assert len(elements) == 4 + + assert elements[0] == Title("January 2023") + assert elements[0].metadata.emphasized_text_contents is None + assert elements[0].metadata.link_urls is None + + assert elements[1].text.startswith("(Someone fed my essays") + assert elements[1].text.endswith("I would have said.)") + assert len(elements[1].metadata.emphasized_text_contents) == 1 + assert len(elements[1].metadata.link_urls) == 1 + + assert elements[2].text.startswith("The way to get new ideas") + assert elements[2].text.endswith("the frontiers of knowledge.") + assert elements[2].metadata.emphasized_text_contents is None + assert elements[2].metadata.link_urls is None + + assert elements[3].text.startswith("Knowledge grows fractally") + assert elements[3].text.endswith("whole new fractal buds.") + assert elements[3].metadata.emphasized_text_contents is None + assert elements[3].metadata.link_urls is None def 
test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch): diff --git a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json index 4626325182..ca891379e1 100644 --- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json +++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.json @@ -1,18 +1,27 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filename": "ideas-page.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filename": "ideas-page.html", "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -22,6 +31,28 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. 
In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filename": "ideas-page.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filename": "ideas-page.html", + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json index 232ac31572..8929b48dc0 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json +++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json @@ -353,7 +353,7 @@ }, { "type": "NarrativeText", - "element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3", + "element_id": "073a8fd4fe21204eff8c0ca133f6993f", "metadata": { "data_source": {}, "filetype": "text/html", @@ -365,7 +365,17 @@ "strong" ] }, - "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”" + "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you." + }, + { + "type": "NarrativeText", + "element_id": "d97aee85f18639e200b29757e5783dad", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”" }, { "type": "NarrativeText", @@ -416,26 +426,98 @@ "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready." 
}, { - "type": "NarrativeText", - "element_id": "47d5d0d27a35a36d7467dfc8b6e089b3", + "type": "Title", + "element_id": "c9b4b8b324383371034a3682d0d712d2", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "http://www.commerce.gov" + ], + "link_texts": [ + "US Dept of Commerce" + ] + }, + "text": "US Dept of Commerce" + }, + { + "type": "Title", + "element_id": "668c4fe04cbbc45c7e91b0b675dd48a3", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "http://www.noaa.gov" + ], + "link_texts": [ + "National Oceanic and Atmospheric Administration" + ] + }, + "text": "National Oceanic and Atmospheric Administration" + }, + { + "type": "Title", + "element_id": "a5c0620dc25afae7e2761c210037b45c", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.weather.gov" + ], + "link_texts": [ + "National Weather Service" + ] + }, + "text": "National Weather Service" + }, + { + "type": "Title", + "element_id": "41f6e17bf5e9a407fcca74e902f802a0", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "News Around NOAA" + }, + { + "type": "Title", + "element_id": "d27040ad6074797db8e535d1fba3b5d8", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "1325 East West Highway" + }, + { + "type": "Address", + "element_id": "7ab3e0275d15e2c26b18983db0685ddb", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Silver Spring, MD 20910" + }, + { + "type": "Title", + "element_id": "1b0316a06a8f4d5b672669bb9f5b2877", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "http://www.commerce.gov", - "http://www.noaa.gov", - "https://www.weather.gov", "https://www.weather.gov/news/contact" ], "link_texts": [ - "US Dept of Commerce", - "National Oceanic and 
Atmospheric Administration", - "National Weather Service", "Comments? Questions? Please Contact Us." ] }, - "text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us." + "text": "Comments? Questions? Please Contact Us." }, { "type": "Title", diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? 
You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. 
The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json index a4569d05d6..42f3c097be 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1540126.json @@ -219,26 +219,6 @@ }, "text": "Nice to have:" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, { "type": "Title", "element_id": "7f999c0456e4e85cc028aa6ed90455d4", @@ -255,26 +235,6 @@ }, "text": "Not in scope:" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", - "page_number": 1 - }, - "text": "" - }, { "type": "Title", "element_id": "e8b61a28d07e977379b42df455a1cde4", diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json index dc67c14c40..4008c53432 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605928.json @@ -133,16 +133,6 @@ }, "text": "Notes" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": {}, - "filetype": "text/html", 
- "page_number": 1 - }, - "text": "" - }, { "type": "Title", "element_id": "f158a8eaf72c7e9511d5e8ee03692652", diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
+ "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
+ "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." + "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. 
The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
+ "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json index 813dc26f3b..899445dac5 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -1,17 +1,25 @@ [ + { + "type": "Title", + "element_id": "17c1a6701c263407d0fcf7c3ebfb2986", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "January 2023" + }, { "type": "NarrativeText", - "element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", + "element_id": "6ea0e510b7ea64f87b55c1fe388cba7f", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ - "index.html", "https://twitter.com/stef/status/1617222428727586816" ], "link_texts": [ - null, null ], "emphasized_text_contents": [ @@ -21,6 +29,26 @@ "i" ] }, - "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
+ "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)" + }, + { + "type": "NarrativeText", + "element_id": "a8ce0a2e7d66af2000e6c3bd36994411", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge." + }, + { + "type": "NarrativeText", + "element_id": "4eafbff98b81999dfbf3572440d22393", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index 1aecea366e..3af78803aa 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -35,15 +35,25 @@ }, "text": "Filename" }, + { + "type": "Title", + "element_id": "4112a488690bdbc1d39d5b78068eae9f", + "metadata": { + "data_source": {}, + "filetype": "text/html", + "page_number": 1 + }, + "text": "File Contents" + }, { "type": "NarrativeText", - "element_id": "43f65b1c5bd47774b25c72e2f96de300", + "element_id": "f89c9cf63bd2e72f560ee043d942a1e7", "metadata": { "data_source": {}, "filetype": "text/html", "page_number": 1 }, - "text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded" + "text": "Whatever you put in this text box will be downloaded and saved in the file. 
If you leave it blank, no file will be downloaded" }, { "type": "NarrativeText", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f219b96686..ee61d180d3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.13-dev2" # pragma: no cover +__version__ = "0.10.13-dev3" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 452f2d121a..d3298ad67b 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -36,9 +36,10 @@ LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"] HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"] TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"] +TEXTBREAK_TAGS: Final[List[str]] = ["br"] PAGEBREAK_TAGS: Final[List[str]] = ["hr"] +EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"] -EMPTY_TAGS: Final[List[str]] = ["br", "hr"] SECTION_TAGS: Final[List[str]] = ["div", "pre"] @@ -136,10 +137,18 @@ def _read(self) -> List[Page]: continue if _is_text_tag(tag_elem): - element = _parse_tag(tag_elem) - if element is not None: - page.elements.append(element) - descendanttag_elems = tuple(tag_elem.iterdescendants()) + if _has_break_tags(tag_elem): + flattened_elems = _unfurl_break_tags(tag_elem) + for _tag_elem in flattened_elems: + element = _parse_tag(_tag_elem) + if element is not None: + page.elements.append(element) + + else: + element = _parse_tag(tag_elem) + if element is not None: + page.elements.append(element) + descendanttag_elems = tuple(tag_elem.iterdescendants()) elif _is_container_with_text(tag_elem): links = _get_links_from_tag(tag_elem) @@ -385,6 +394,29 @@ def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> return text.strip() +def _has_break_tags(tag_elem: etree.Element) -> bool: + for descendant in tag_elem.iterdescendants(): + if descendant.tag in 
TEXTBREAK_TAGS: + return True + return False + + +def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]: + unfurled = [] + children = tag_elem.getchildren() + for child in children: + if not _has_break_tags(child): + unfurled.append(child) + else: + if child.text: + _tag_elem = etree.Element(child.tag) + _tag_elem.text = child.text + unfurled.append(_tag_elem) + unfurled.extend(_unfurl_break_tags(child)) + + return unfurled + + def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool: """Deteremines if a tag potentially contains narrative text.""" # NOTE(robinson) - Only consider elements with limited depth. Otherwise,