Skip to content

Commit

Permalink
add additional docs and tests for multilanguage detection in docx, epub, odt, and rtf partitioners
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish committed Oct 3, 2023
1 parent b108e5a commit c36d434
Show file tree
Hide file tree
Showing 9 changed files with 615 additions and 8 deletions.
Binary file added example-docs/language-docs/eng_spa_mult.docx
Binary file not shown.
Binary file added example-docs/language-docs/eng_spa_mult.epub
Binary file not shown.
Binary file added example-docs/language-docs/eng_spa_mult.odt
Binary file not shown.
580 changes: 580 additions & 0 deletions example-docs/language-docs/eng_spa_mult.rtf

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,15 @@ def test_add_chunking_strategy_on_partition_docx(
assert chunk_elements != elements
assert chunk_elements == chunks


def test_partition_docx_element_metadata_has_languages():
    """The first element partitioned from handbook-1p.docx reports ``["eng"]`` as its languages."""
    first_element = partition_docx(filename="example-docs/handbook-1p.docx")[0]
    assert first_element.metadata.languages == ["eng"]



def test_partition_docx_detects_multiple_elements_in_other_language():
    """With per-element detection, eng_spa_mult.docx yields the expected languages per element."""
    expected = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    partitioned = partition_docx(
        filename="example-docs/language-docs/eng_spa_mult.docx",
        detect_language_per_element=True,
    )
    # Compare the full ordered list so both element order and each language list are pinned.
    assert [item.metadata.languages for item in partitioned] == expected
11 changes: 9 additions & 2 deletions test_unstructured/partition/epub/test_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,15 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
assert chunk_elements != elements
assert chunk_elements == chunks


def test_partition_epub_element_metadata_has_languages():
    """The first element partitioned from winter-sports.epub reports ``["eng"]`` as its languages."""
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
    elements = partition_epub(filename=filename)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_epub_detects_multiple_elements_in_other_language():
    """With per-element detection, eng_spa_mult.epub yields the expected languages per element."""
    expected = [["eng", "ron"], ["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    partitioned = partition_epub(
        filename="example-docs/language-docs/eng_spa_mult.epub",
        detect_language_per_element=True,
    )
    # Compare the full ordered list so both element order and each language list are pinned.
    assert [item.metadata.languages for item in partitioned] == expected
11 changes: 9 additions & 2 deletions test_unstructured/partition/odt/test_odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,15 @@ def test_add_chunking_strategy_on_partition_odt_non_default():
assert chunk_elements != elements
assert chunk_elements == chunks


def test_partition_odt_element_metadata_has_languages():
    """The first element partitioned from fake.odt reports ``["eng"]`` as its languages."""
    filename = "example-docs/fake.odt"
    elements = partition_odt(filename=filename)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_odt_detects_multiple_elements_in_other_language():
    """With per-element detection, eng_spa_mult.odt yields the expected languages per element."""
    expected = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    partitioned = partition_odt(
        filename="example-docs/language-docs/eng_spa_mult.odt",
        detect_language_per_element=True,
    )
    # Compare the full ordered list so both element order and each language list are pinned.
    assert [item.metadata.languages for item in partitioned] == expected
7 changes: 7 additions & 0 deletions test_unstructured/partition/pypandoc/test_rtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,10 @@ def test_partition_rtf_element_metadata_has_languages():
filename = "example-docs/fake-doc.rtf"
elements = partition_rtf(filename=filename)
assert elements[0].metadata.languages == ["eng"]


def test_partition_rtf_detects_multiple_elements_in_other_language():
    """With per-element detection, eng_spa_mult.rtf yields the expected languages per element."""
    expected = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
    partitioned = partition_rtf(
        filename="example-docs/language-docs/eng_spa_mult.rtf",
        detect_language_per_element=True,
    )
    # Compare the full ordered list so both element order and each language list are pinned.
    assert [item.metadata.languages for item in partitioned] == expected
4 changes: 2 additions & 2 deletions test_unstructured/partition/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,8 +539,8 @@ def test_partition_text_element_metadata_has_languages():
def test_partition_text_detects_multiple_elements_in_other_language():
    """With per-element detection, eng_spa_mult.txt yields the expected languages per element.

    The ordered, per-element comparison is the only check needed: it also implies
    that the set of first-listed languages across elements is ``{"eng", "spa"}``.
    """
    filename = "example-docs/language-docs/eng_spa_mult.txt"
    elements = partition_text(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_partition_text_detects_more_than_3_languages():
Expand Down

0 comments on commit c36d434

Please sign in to comment.