From b1c4048eee2207b6a254b4675e01ebaf579fce93 Mon Sep 17 00:00:00 2001 From: coniferish Date: Wed, 4 Oct 2023 12:12:12 -0500 Subject: [PATCH] improve docstrings and remove detect_per_element kwarg from partitioners that only return one element (xlsx, csv, tsv) --- unstructured/partition/csv.py | 15 +++------------ unstructured/partition/doc.py | 5 +++-- unstructured/partition/docx.py | 10 ++++++---- unstructured/partition/email.py | 5 +++-- unstructured/partition/epub.py | 5 +++-- unstructured/partition/html.py | 10 ++++++---- unstructured/partition/md.py | 5 +++-- unstructured/partition/msg.py | 5 +++-- unstructured/partition/odt.py | 5 +++-- unstructured/partition/org.py | 5 +++-- unstructured/partition/ppt.py | 5 +++-- unstructured/partition/pptx.py | 5 +++-- unstructured/partition/rst.py | 5 +++-- unstructured/partition/rtf.py | 5 +++-- unstructured/partition/text.py | 5 +++-- unstructured/partition/tsv.py | 10 +++------- unstructured/partition/xlsx.py | 10 +++------- unstructured/partition/xml.py | 5 +++-- 18 files changed, 60 insertions(+), 60 deletions(-) diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 837326dfb9..a38c29c728 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -31,7 +31,6 @@ def partition_csv( metadata_last_modified: Optional[str] = None, include_metadata: bool = True, languages: List[str] = ["auto"], - detect_language_per_element: bool = False, **kwargs, ) -> List[Element]: """Partitions Microsoft Excel Documents in .csv format into its document elements. @@ -49,11 +48,9 @@ def partition_csv( include_metadata Determines whether or not metadata is included in the output. languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. + User defined value for `metadata.languages` if provided. 
Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. """ exactly_one(filename=filename, file=file) @@ -75,11 +72,6 @@ def partition_csv( html_text = table.to_html(index=False, header=False, na_rep="") text = soupparser_fromstring(html_text).text_content() - # languages = detect_languages( - # text=text, - # languages=languages, - # detect_language_per_element=detect_language_per_element, - # ) if include_metadata: metadata = ElementMetadata( @@ -95,6 +87,5 @@ def partition_csv( apply_lang_metadata( [Table(text=text, metadata=metadata)], languages=languages, - detect_language_per_element=detect_language_per_element, ), ) diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 07b34bd496..9a7b991591 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -45,8 +45,9 @@ def partition_doc( filter that is required when using LibreOffice7. Pass in None if you do not want to apply any filter. languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. + User defined value for `metadata.languages` if provided. Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. Additional Parameters: detect_language_per_element Detect language per element instead of at the document level. diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 44daf16bae..b7b7ad83c6 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -107,8 +107,9 @@ def convert_and_partition_docx( Determines whether or not metadata is included in the metadata attribute on the elements in the output. languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. 
+ User defined value for `metadata.languages` if provided. Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. Additional Parameters: detect_language_per_element Detect language per element instead of at the document level. @@ -187,8 +188,9 @@ def partition_docx( metadata_last_modified The last modified date for the document. languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. + User defined value for `metadata.languages` if provided. Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. Additional Parameters: detect_language_per_element Detect language per element instead of at the document level. diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 2c83f99426..b2e4105003 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -294,8 +294,9 @@ def partition_email( The minimum number of characters to include in a partition. Only applies if processing the text/plain content. languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. + User defined value for `metadata.languages` if provided. Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. Additional Parameters: detect_language_per_element Detect language per element instead of at the document level. diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index a5d2441ec0..d818f3bb87 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -49,8 +49,9 @@ def partition_epub( encoding The encoding method used to decode the text input. If None, utf-8 will be used. 
languages - Detected language of a text using naive Bayesian filter. Multiple languages indicates text - could be in either language. + User defined value for `metadata.languages` if provided. Otherwise language is detected + using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be + in either language. Additional Parameters: detect_language_per_element Detect language per element instead of at the document level. diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 4aa67d2ee5..639906cf70 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -82,8 +82,9 @@ def partition_html( skip_headers_and_footers If True, ignores any content that is within
<header> or <footer>