Skip to content

Commit

Permalink
test: update tests for checking data_origin
Browse files Browse the repository at this point in the history
  • Loading branch information
Benjamin Torres committed Oct 3, 2023
1 parent a73bf65 commit 61d3a4a
Show file tree
Hide file tree
Showing 12 changed files with 35 additions and 15 deletions.
4 changes: 3 additions & 1 deletion test_unstructured/partition/csv/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from unstructured.documents.elements import Table
from unstructured.partition.csv import partition_csv
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

EXPECTED_FILETYPE = "text/csv"
Expand Down Expand Up @@ -59,7 +60,8 @@ def test_partition_csv_from_file(filename, expected_text, expected_table):
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
assert elements[0].metadata.filename is None
- assert {element.metadata.data_origin for element in elements} == {"csv"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"csv"}


def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import _DocxPartitioner, partition_docx
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json


Expand Down Expand Up @@ -106,7 +107,8 @@ def test_partition_docx_from_filename(
assert elements[0].metadata.page_number is None
for element in elements:
assert element.metadata.filename == "mock_document.docx"
- assert {element.metadata.data_origin for element in elements} == {"docx"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"docx"}


def test_partition_docx_from_filename_with_metadata_filename(mock_document, tmpdir):
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/epub/test_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unstructured.documents.elements import Table, Text
from unstructured.partition.epub import partition_epub
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -33,7 +34,8 @@ def test_partition_epub_from_filename():
assert element.metadata.section is not None
all_sections.add(element.metadata.section)
assert all_sections == expected_sections
- assert {element.metadata.data_origin for element in elements} == {"epub"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"epub"}


def test_partition_epub_from_filename_returns_table_in_elements():
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/markdown/test_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from unstructured.documents.elements import Title
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand All @@ -21,7 +22,8 @@ def test_partition_md_from_filename():
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "README.md"
- assert {element.metadata.data_origin for element in elements} == {"md"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"md"}


def test_partition_md_from_filename_returns_uns_elements():
Expand Down
5 changes: 3 additions & 2 deletions test_unstructured/partition/msg/test_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from unstructured.partition.json import partition_json
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
from unstructured.partition.text import partition_text
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -55,12 +56,12 @@ def test_partition_msg_from_filename():
subject="Test Email",
filetype="application/vnd.ms-outlook",
parent_id=parent_id,
- data_origin="msg",
).to_dict()
)
for element in elements:
assert element.metadata.filename == "fake-email.msg"
- assert {element.metadata.data_origin for element in elements} == {"msg"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"msg"}


def test_partition_msg_from_filename_returns_uns_elements():
Expand Down
8 changes: 5 additions & 3 deletions test_unstructured/partition/odt/test_odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unstructured.documents.elements import Table, Title
from unstructured.partition.json import partition_json
from unstructured.partition.odt import partition_odt
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand All @@ -26,9 +27,10 @@ def test_partition_odt_from_filename():
]
for element in elements:
assert element.metadata.filename == "fake.odt"
- assert {element.metadata.data_origin for element in elements} == {
-     "docx",
- } # this file is processed by docx backend
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {
+         "docx",
+     } # this file is processed by docx backend


def test_partition_odt_from_filename_with_metadata_filename():
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/pdf-image/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from unstructured.chunking.title import chunk_by_title
from unstructured.partition import image, pdf
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -245,7 +246,8 @@ def test_partition_image_default_strategy_hi_res():
assert elements[0].metadata.coordinates is not None
assert elements[0].metadata.detection_class_prob is not None
assert isinstance(elements[0].metadata.detection_class_prob, float)
- assert {element.metadata.data_origin for element in elements} == {"image"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"image"}


def test_partition_image_metadata_date(
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)
from unstructured.partition import pdf, strategies
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json


Expand Down Expand Up @@ -132,7 +133,8 @@ def _test(result):
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected
- assert {element.metadata.data_origin for element in result} == {origin}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in result} == {origin}

if file_mode == "filename":
result = pdf.partition_pdf(filename=filename, strategy=strategy)
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/pptx/test_ppt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from unstructured.documents.elements import ListItem, NarrativeText, Title
from unstructured.partition.json import partition_json
from unstructured.partition.ppt import partition_ppt
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand All @@ -28,7 +29,8 @@ def test_partition_ppt_from_filename():
assert elements == EXPECTED_PPT_OUTPUT
for element in elements:
assert element.metadata.filename == "fake-power-point.ppt"
- assert {element.metadata.data_origin for element in elements} == {"pptx"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"pptx"}


def test_partition_ppt_from_filename_with_metadata_filename():
Expand Down
1 change: 0 additions & 1 deletion test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,6 @@ def test_partition_email_from_filename_has_metadata():
subject="Test Email",
filetype="message/rfc822",
parent_id=parent_id,
- data_origin="email",
).to_dict()
)
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
partition_text,
split_content_to_fit_max,
)
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json

DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand Down Expand Up @@ -67,7 +68,8 @@ def test_partition_text_from_filename(filename, encoding):
assert elements == EXPECTED_OUTPUT
for element in elements:
assert element.metadata.filename == filename
- assert {element.metadata.data_origin for element in elements} == {"text"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"text"}


def test_partition_text_from_filename_with_metadata_filename():
Expand Down
4 changes: 3 additions & 1 deletion test_unstructured/partition/test_xml_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import NarrativeText, Title
from unstructured.partition.json import partition_json
+ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_json

Expand All @@ -22,7 +23,8 @@ def test_partition_xml_from_filename(filename):

assert elements[0].text == "United States"
assert elements[0].metadata.filename == filename
- assert {element.metadata.data_origin for element in elements} == {"xml"}
+ if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
+     assert {element.metadata.data_origin for element in elements} == {"xml"}


def test_partition_xml_from_filename_with_metadata_filename():
Expand Down

0 comments on commit 61d3a4a

Please sign in to comment.