Skip to content

Commit

Permalink
improve docstrings and remove detect_per_element kwarg from partition…
Browse files Browse the repository at this point in the history
…ers that only return one element (xlsx, csv, tsv)
  • Loading branch information
Coniferish committed Oct 4, 2023
1 parent 2879f21 commit b1c4048
Show file tree
Hide file tree
Showing 18 changed files with 60 additions and 60 deletions.
15 changes: 3 additions & 12 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def partition_csv(
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
detect_language_per_element: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .csv format into their document elements.
Expand All @@ -49,11 +48,9 @@ def partition_csv(
include_metadata
Determines whether or not metadata is included in the output.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -75,11 +72,6 @@ def partition_csv(

html_text = table.to_html(index=False, header=False, na_rep="")
text = soupparser_fromstring(html_text).text_content()
# languages = detect_languages(
# text=text,
# languages=languages,
# detect_language_per_element=detect_language_per_element,
# )

if include_metadata:
metadata = ElementMetadata(
Expand All @@ -95,6 +87,5 @@ def partition_csv(
apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
5 changes: 3 additions & 2 deletions unstructured/partition/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ def partition_doc(
filter that is required when using LibreOffice7. Pass in None
if you do not want to apply any filter.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
10 changes: 6 additions & 4 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,9 @@ def convert_and_partition_docx(
Determines whether or not metadata is included in the metadata attribute on the elements in
the output.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down Expand Up @@ -187,8 +188,9 @@ def partition_docx(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,9 @@ def partition_email(
The minimum number of characters to include in a partition. Only applies if
processing the text/plain content.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ def partition_epub(
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
10 changes: 6 additions & 4 deletions unstructured/partition/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,9 @@ def partition_html(
skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down Expand Up @@ -183,8 +184,9 @@ def convert_and_partition_html(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@ def partition_md(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def partition_msg(
The minimum number of characters to include in a partition. Only applies if
processing text/plain content.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def partition_odt(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/org.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ def partition_org(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/ppt.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ def partition_ppt(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,9 @@ def partition_pptx(
include_slide_notes
If True, includes the slide notes as element
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def partition_rst(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/rtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def partition_rtf(
metadata_last_modified
The last modified date for the document.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
5 changes: 3 additions & 2 deletions unstructured/partition/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,9 @@ def partition_text(
include_metadata
Determines whether or not metadata is included in the output.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down
10 changes: 3 additions & 7 deletions unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def partition_tsv(
metadata_last_modified: Optional[str] = None,
include_metadata: bool = True,
languages: List[str] = ["auto"],
detect_language_per_element: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions TSV files into document elements.
Expand All @@ -45,11 +44,9 @@ def partition_tsv(
metadata_last_modified
The day of the last modification.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
"""
exactly_one(filename=filename, file=file)

Expand Down Expand Up @@ -86,6 +83,5 @@ def partition_tsv(
apply_lang_metadata(
[Table(text=text, metadata=metadata)],
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
10 changes: 3 additions & 7 deletions unstructured/partition/xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def partition_xlsx(
metadata_last_modified: Optional[str] = None,
include_header: bool = True,
languages: List[str] = ["auto"],
detect_language_per_element: bool = False,
**kwargs,
) -> List[Element]:
"""Partitions Microsoft Excel Documents in .xlsx format into their document elements.
Expand All @@ -50,11 +49,9 @@ def partition_xlsx(
include_header
Determines whether or not header info is included in text and metadata.text_as_html
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
"""
exactly_one(filename=filename, file=file)

Expand Down Expand Up @@ -100,6 +97,5 @@ def partition_xlsx(
apply_lang_metadata(
elements=elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
5 changes: 3 additions & 2 deletions unstructured/partition/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,9 @@ def partition_xml(
metadata_last_modified
The day of the last modification.
languages
Detected language of a text using naive Bayesian filter. Multiple languages indicates text
could be in either language.
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
Expand Down

0 comments on commit b1c4048

Please sign in to comment.