Skip to content

Commit

Permalink
Feat/1060 update metadata fields (#1099)
Browse files Browse the repository at this point in the history
Closes GitHub Issue #1060.

* update the metadata field `links`
* update the metadata field `emphasized_texts`
  • Loading branch information
christinestraub authored Aug 16, 2023
1 parent fe5048a commit 0e887cc
Show file tree
Hide file tree
Showing 21 changed files with 536 additions and 633 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.9.4-dev0
## 0.10.0

### Enhancements

* Update the `links` and `emphasized_texts` metadata fields

### Features

### Fixes
Expand Down
81 changes: 53 additions & 28 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import (
_extract_contents_and_tags,
_get_emphasized_texts_from_paragraph,
_get_emphasized_texts_from_table,
partition_docx,
Expand Down Expand Up @@ -63,6 +64,26 @@ def expected_elements():
]


@pytest.fixture()
def expected_emphasized_texts():
return [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]


@pytest.fixture()
def expected_emphasized_text_contents():
return ["bold", "italic", "bold-italic", "bold-italic"]


@pytest.fixture()
def expected_emphasized_text_tags():
return ["b", "i", "b", "i"]


def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
Expand Down Expand Up @@ -293,19 +314,14 @@ def test_partition_docx_from_file_without_metadata_date(


def test_get_emphasized_texts_from_paragraph(
expected_emphasized_texts,
filename="example-docs/fake-doc-emphasized-text.docx",
):
expected = [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
document = docx.Document(filename)
paragraph = document.paragraphs[1]
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
assert paragraph.text == "I am a bold italic bold-italic text."
assert emphasized_texts == expected
assert emphasized_texts == expected_emphasized_texts

paragraph = document.paragraphs[2]
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
Expand All @@ -319,18 +335,29 @@ def test_get_emphasized_texts_from_paragraph(


def test_get_emphasized_texts_from_table(
expected_emphasized_texts,
filename="example-docs/fake-doc-emphasized-text.docx",
):
expected = [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
document = docx.Document(filename)
table = document.tables[0]
emphasized_texts = _get_emphasized_texts_from_table(table)
assert emphasized_texts == expected
assert emphasized_texts == expected_emphasized_texts


def test_extract_contents_and_tags(
expected_emphasized_texts,
expected_emphasized_text_contents,
expected_emphasized_text_tags,
):
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
expected_emphasized_texts,
)
assert emphasized_text_contents == expected_emphasized_text_contents
assert emphasized_text_tags == expected_emphasized_text_tags

emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([])
assert emphasized_text_contents is None
assert emphasized_text_tags is None


@pytest.mark.parametrize(
Expand All @@ -340,24 +367,22 @@ def test_get_emphasized_texts_from_table(
("fake-doc-emphasized-text.doc", partition_doc),
],
)
def test_partition_docx_grabs_emphasized_texts(filename, partition_func):
def test_partition_docx_grabs_emphasized_texts(
filename,
partition_func,
expected_emphasized_text_contents,
expected_emphasized_text_tags,
):
elements = partition_func(filename=f"example-docs/{filename}")

assert isinstance(elements[0], Table)
assert elements[0].metadata.emphasized_texts == [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
assert elements[0].metadata.emphasized_text_tags == expected_emphasized_text_tags

assert elements[1] == NarrativeText("I am a bold italic bold-italic text.")
assert elements[1].metadata.emphasized_texts == [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
assert elements[1].metadata.emphasized_text_contents == expected_emphasized_text_contents
assert elements[1].metadata.emphasized_text_tags == expected_emphasized_text_tags

assert elements[2] == NarrativeText("I am a normal text.")
assert elements[2].metadata.emphasized_texts is None
assert elements[2].metadata.emphasized_text_contents is None
assert elements[2].metadata.emphasized_text_tags is None
57 changes: 23 additions & 34 deletions test_unstructured/partition/test_html_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,34 +455,24 @@ def test_partition_html_grabs_links():
elements = partition_html(text=html_text)

assert elements[0] == NarrativeText("Hello there I am a very important link!")
assert elements[0].metadata.links == [
{
"text": "very important link!",
"url": "/link",
},
]
assert elements[0].metadata.link_urls == ["/link"]
assert elements[0].metadata.link_texts == ["very important link!"]

assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.links is None
assert elements[1].metadata.link_urls is None
assert elements[1].metadata.link_texts is None

assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.links == [
{
"text": "Parrots",
"url": "https://en.wikipedia.org/wiki/Parrot",
},
]
assert elements[2].metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
assert elements[2].metadata.link_texts == ["Parrots"]

assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.links is None
assert elements[3].metadata.link_urls is None
assert elements[3].metadata.link_texts is None

assert elements[4] == Title("A lone link!")
assert elements[4].metadata.links == [
{
"text": "A lone link!",
"url": "/loner",
},
]
assert elements[4].metadata.link_urls == ["/loner"]
assert elements[4].metadata.link_texts == ["A lone link!"]


def test_partition_html_from_filename_with_skip_headers_and_footers(
Expand Down Expand Up @@ -570,26 +560,25 @@ def test_partition_html_grabs_emphasized_texts():
elements = partition_html(text=html_text)

assert elements[0] == NarrativeText("Hello there I am a very important text!")
assert elements[0].metadata.emphasized_texts == [
{"text": "important", "tag": "strong"},
]
assert elements[0].metadata.emphasized_text_contents == ["important"]
assert elements[0].metadata.emphasized_text_tags == ["strong"]

assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.emphasized_texts == [
{"text": "list", "tag": "span"},
{"text": "my favorite things", "tag": "b"},
{"text": "favorite", "tag": "i"},
assert elements[1].metadata.emphasized_text_contents == [
"list",
"my favorite things",
"favorite",
]
assert elements[1].metadata.emphasized_text_tags == ["span", "b", "i"]

assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.emphasized_texts == [
{"text": "Parrots", "tag": "em"},
]
assert elements[2].metadata.emphasized_text_contents == ["Parrots"]
assert elements[2].metadata.emphasized_text_tags == ["em"]

assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.emphasized_texts is None
assert elements[3].metadata.emphasized_text_contents is None
assert elements[3].metadata.emphasized_text_tags is None

assert elements[4] == Title("A lone span text!")
assert elements[4].metadata.emphasized_texts == [
{"text": "A lone span text!", "tag": "span"},
]
assert elements[4].metadata.emphasized_text_contents == ["A lone span text!"]
assert elements[4].metadata.emphasized_text_tags == ["span"]
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,19 @@
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
Expand Down
Loading

0 comments on commit 0e887cc

Please sign in to comment.