Skip to content

Commit

Permalink
resolve PR review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish committed Oct 4, 2023
1 parent 808a1fe commit 2561928
Show file tree
Hide file tree
Showing 12 changed files with 71 additions and 82 deletions.
4 changes: 2 additions & 2 deletions test_unstructured/partition/xlsx/test_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,10 @@ def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
assert sum(isinstance(element, Table) for element in elements) == 3
assert len(elements) == 6


# NOTE (jennings) partition_xlsx returns a single TableElement,
# so we can only detect the language of that single element
def test_partition_xlsx_element_metadata_has_languages():
filename = "example-docs/stanley-cups.xlsx"
elements = partition_xlsx(filename=filename)
assert elements[0].metadata.languages == ["eng"]
assert elements[0].metadata.languages == ["eng"]
6 changes: 3 additions & 3 deletions unstructured/file_utils/metadata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import io
from dataclasses import dataclass, field
from typing import IO, Any, Dict, Final, Iterable, Iterator, List, Optional, Union
from typing import IO, Any, Dict, Final, Iterable, Iterator, List, Optional

import docx
import openpyxl
Expand Down Expand Up @@ -156,7 +156,7 @@ def _get_exif_datetime(exif_dict: Dict[str, Any], key: str) -> Optional[datetime


def apply_lang_metadata(
elements: Union[Iterable[Element], List[Element]],
elements: Iterable[Element],
languages: List[str],
detect_language_per_element: bool = False,
) -> Iterator[Element]:
Expand All @@ -182,7 +182,7 @@ def apply_lang_metadata(
and len(languages) == 1
and detect_language_per_element is False
):
# -- apply detected languge to each metadata --
# -- apply detected language to each metadata --
for e in elements:
e.metadata.languages = detected_languages
yield e
Expand Down
10 changes: 5 additions & 5 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def partition_csv(
else:
metadata = ElementMetadata()

return list(
apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
),
elements = apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
)

return list(elements)
33 changes: 15 additions & 18 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,21 +203,19 @@ def partition_docx(
'The language parameter must be a list of language codes as strings, ex. ["eng"]',
)

elements = list(
apply_lang_metadata(
elements=_DocxPartitioner.iter_document_elements(
filename,
file,
metadata_filename,
include_page_breaks,
metadata_last_modified,
),
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = _DocxPartitioner.iter_document_elements(
filename,
file,
metadata_filename,
include_page_breaks,
metadata_last_modified,
)

return elements
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return list(elements)


class _DocxPartitioner:
Expand Down Expand Up @@ -292,8 +290,6 @@ def _iter_document_elements(self) -> Iterator[Element]:
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
# -- characteristic of a generator avoids repeated code to form interim results into lists.

# full_text = ""

for section_idx, section in enumerate(self._document.sections):
yield from self._iter_section_page_breaks(section_idx, section)
yield from self._iter_section_headers(section)
Expand Down Expand Up @@ -516,10 +512,11 @@ def page_is_odd() -> bool:
# -- predict when two page breaks will be needed and emit one of them. The second will be
# -- emitted by the rendered page-break to follow.

if start_type == WD_SECTION_START.EVEN_PAGE and not page_is_odd():
if start_type == WD_SECTION_START.EVEN_PAGE: # noqa
# -- on an even page we need two total, add one to supplement the rendered page break
# -- to follow. There is no "first-document-page" special case because 1 is odd.
yield from self._increment_page_number()
if not page_is_odd():
yield from self._increment_page_number()

elif start_type == WD_SECTION_START.ODD_PAGE:
# -- the first page of the document is an implicit "new" odd-page, so no page-break --
Expand Down
12 changes: 6 additions & 6 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,10 +465,10 @@ def partition_email(
element.metadata.attached_to_filename = metadata_filename or filename
all_elements.append(element)

return list(
apply_lang_metadata(
elements=all_elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = apply_lang_metadata(
elements=all_elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)

return list(elements)
12 changes: 5 additions & 7 deletions unstructured/partition/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,10 @@ def partition_epub(

elements.extend(section_elements)

elements = list(
apply_lang_metadata(
elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = apply_lang_metadata(
elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)

return elements
return list(elements)
11 changes: 5 additions & 6 deletions unstructured/partition/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,12 @@ def partition_msg(
element.metadata.attached_to_filename = metadata_filename or filename
elements.append(element)

return list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return list(elements)


def build_msg_metadata(
Expand Down
24 changes: 12 additions & 12 deletions unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,19 +103,19 @@ def partition_pptx(
source_file = file or filename
assert source_file is not None

return list(
apply_lang_metadata(
elements=_PptxPartitioner.iter_presentation_elements(
source_file,
include_page_breaks,
include_slide_notes,
metadata_filename,
metadata_last_modified,
),
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = _PptxPartitioner.iter_presentation_elements(
source_file,
include_page_breaks,
include_slide_notes,
metadata_filename,
metadata_last_modified,
)
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return list(elements)


class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
Expand Down
12 changes: 5 additions & 7 deletions unstructured/partition/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,12 @@ def partition_text(
element.metadata = copy.deepcopy(metadata)
elements.append(element)

elements = list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return elements
return list(elements)


def element_from_text(
Expand Down
9 changes: 4 additions & 5 deletions unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,8 @@ def partition_tsv(
else:
metadata = ElementMetadata()

return list(
apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
),
elements = apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
)
return list(elements)
9 changes: 4 additions & 5 deletions unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,11 @@ def partition_xlsx(
element.metadata.languages = languages
elements.append(element)

return list(
apply_lang_metadata(
elements=elements,
languages=languages,
),
elements = apply_lang_metadata(
elements=elements,
languages=languages,
)
return list(elements)


def _get_connected_components(
Expand Down
11 changes: 5 additions & 6 deletions unstructured/partition/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,9 @@ def partition_xml(
element.metadata = copy.deepcopy(metadata)
elements.append(element)

return list(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
elements = apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
)
return list(elements)

0 comments on commit 2561928

Please sign in to comment.