From b1c4048eee2207b6a254b4675e01ebaf579fce93 Mon Sep 17 00:00:00 2001
From: coniferish <johnjennings702@gmail.com>
Date: Wed, 4 Oct 2023 12:12:12 -0500
Subject: [PATCH] improve docstrings and remove detect_language_per_element kwarg
 from partitioners that only return one element (xlsx, csv, tsv)

---
 unstructured/partition/csv.py   | 15 +++------------
 unstructured/partition/doc.py   |  5 +++--
 unstructured/partition/docx.py  | 10 ++++++----
 unstructured/partition/email.py |  5 +++--
 unstructured/partition/epub.py  |  5 +++--
 unstructured/partition/html.py  | 10 ++++++----
 unstructured/partition/md.py    |  5 +++--
 unstructured/partition/msg.py   |  5 +++--
 unstructured/partition/odt.py   |  5 +++--
 unstructured/partition/org.py   |  5 +++--
 unstructured/partition/ppt.py   |  5 +++--
 unstructured/partition/pptx.py  |  5 +++--
 unstructured/partition/rst.py   |  5 +++--
 unstructured/partition/rtf.py   |  5 +++--
 unstructured/partition/text.py  |  5 +++--
 unstructured/partition/tsv.py   | 10 +++-------
 unstructured/partition/xlsx.py  | 10 +++-------
 unstructured/partition/xml.py   |  5 +++--
 18 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 837326dfb9..a38c29c728 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -31,7 +31,6 @@ def partition_csv(
     metadata_last_modified: Optional[str] = None,
     include_metadata: bool = True,
     languages: List[str] = ["auto"],
-    detect_language_per_element: bool = False,
     **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
@@ -49,11 +48,9 @@ def partition_csv(
     include_metadata
         Determines whether or not metadata is included in the output.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
     """
     exactly_one(filename=filename, file=file)
 
@@ -75,11 +72,6 @@ def partition_csv(
 
     html_text = table.to_html(index=False, header=False, na_rep="")
     text = soupparser_fromstring(html_text).text_content()
-    # languages = detect_languages(
-    #     text=text,
-    #     languages=languages,
-    #     detect_language_per_element=detect_language_per_element,
-    # )
 
     if include_metadata:
         metadata = ElementMetadata(
@@ -95,6 +87,5 @@ def partition_csv(
         apply_lang_metadata(
             [Table(text=text, metadata=metadata)],
             languages=languages,
-            detect_language_per_element=detect_language_per_element,
         ),
     )
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
index 07b34bd496..9a7b991591 100644
--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@@ -45,8 +45,9 @@ def partition_doc(
         filter that is required when using LibreOffice7. Pass in None
         if you do not want to apply any filter.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 44daf16bae..b7b7ad83c6 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -107,8 +107,9 @@ def convert_and_partition_docx(
         Determines whether or not metadata is included in the metadata attribute on the elements in
         the output.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
@@ -187,8 +188,9 @@ def partition_docx(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 2c83f99426..b2e4105003 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -294,8 +294,9 @@ def partition_email(
         The minimum number of characters to include in a partition. Only applies if
         processing the text/plain content.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py
index a5d2441ec0..d818f3bb87 100644
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@@ -49,8 +49,9 @@ def partition_epub(
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
index 4aa67d2ee5..639906cf70 100644
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@@ -82,8 +82,9 @@ def partition_html(
     skip_headers_and_footers
         If True, ignores any content that is within <header> or <footer> tags
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
@@ -183,8 +184,9 @@ def convert_and_partition_html(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py
index 0e2173e1a9..f3d3fab4ea 100644
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@@ -60,8 +60,9 @@ def partition_md(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 88b797c499..9951c08477 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -57,8 +57,9 @@ def partition_msg(
         The minimum number of characters to include in a partition. Only applies if
         processing text/plain content.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py
index 943968bf0b..da622bf131 100644
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@@ -35,8 +35,9 @@ def partition_odt(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/org.py b/unstructured/partition/org.py
index feb3618a01..51c965c685 100644
--- a/unstructured/partition/org.py
+++ b/unstructured/partition/org.py
@@ -33,8 +33,9 @@ def partition_org(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py
index 19f639ff7f..c1ecddfcd6 100644
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@@ -42,8 +42,9 @@ def partition_ppt(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index b8c2a64f8d..bebdde4f6b 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -78,8 +78,9 @@ def partition_pptx(
     include_slide_notes
         If True, includes the slide notes as element
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py
index 916dc5f9b5..4b074cd871 100644
--- a/unstructured/partition/rst.py
+++ b/unstructured/partition/rst.py
@@ -35,8 +35,9 @@ def partition_rst(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py
index 6990a4a115..116ad34628 100644
--- a/unstructured/partition/rtf.py
+++ b/unstructured/partition/rtf.py
@@ -35,8 +35,9 @@ def partition_rtf(
     metadata_last_modified
         The last modified date for the document.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 9afbff0b3c..d7ef627ac8 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -195,8 +195,9 @@ def partition_text(
     include_metadata
         Determines whether or not metadata is included in the output.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index f91aa3ddb7..9a02ee4f1f 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -29,7 +29,6 @@ def partition_tsv(
     metadata_last_modified: Optional[str] = None,
     include_metadata: bool = True,
     languages: List[str] = ["auto"],
-    detect_language_per_element: bool = False,
     **kwargs,
 ) -> List[Element]:
     """Partitions TSV files into document elements.
@@ -45,11 +44,9 @@ def partition_tsv(
     metadata_last_modified
         The day of the last modification.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
     """
     exactly_one(filename=filename, file=file)
 
@@ -86,6 +83,5 @@ def partition_tsv(
         apply_lang_metadata(
             [Table(text=text, metadata=metadata)],
             languages=languages,
-            detect_language_per_element=detect_language_per_element,
         ),
     )
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index b9fc596919..0cf1e1f4de 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -32,7 +32,6 @@ def partition_xlsx(
     metadata_last_modified: Optional[str] = None,
     include_header: bool = True,
     languages: List[str] = ["auto"],
-    detect_language_per_element: bool = False,
     **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Excel Documents in .xlsx format into its document elements.
@@ -50,11 +49,9 @@ def partition_xlsx(
     include_header
         Determines whether or not header info info is included in text and medatada.text_as_html
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
     """
     exactly_one(filename=filename, file=file)
 
@@ -100,6 +97,5 @@ def partition_xlsx(
         apply_lang_metadata(
             elements=elements,
             languages=languages,
-            detect_language_per_element=detect_language_per_element,
         ),
     )
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 56235a6962..a9be35afb4 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -117,8 +117,9 @@ def partition_xml(
     metadata_last_modified
         The day of the last modification.
     languages
-        Detected language of a text using naive Bayesian filter. Multiple languages indicates text
-        could be in either language.
+        User-defined value for `metadata.languages` if provided. Otherwise the language is
+        detected using a naive Bayesian filter via `langdetect`. Multiple languages indicate
+        the text could be in any of those languages.
         Additional Parameters:
             detect_language_per_element
                 Detect language per element instead of at the document level.