depr(python, rust!): Rename write_csv parameter quote to quote_char #11583

Merged
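The gist of the change: the CSV writer's `quote` parameter becomes `quote_char`, and the diff below also renames the `separator` parameter to `delimiter_char` across the read/scan/write APIs. A minimal before/after sketch based on the signatures in this diff (the frame contents are illustrative):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x,y", "z"]})

# Before this PR (old spellings):
#   df.write_csv(separator=",", quote='"')
# After this PR, per the signatures below:
csv_text = df.write_csv(delimiter_char=",", quote_char='"')
print(csv_text)  # file=None, so the CSV is returned as a string
```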
6 changes: 3 additions & 3 deletions crates/polars-plan/src/logical_plan/builder.rs
@@ -265,7 +265,7 @@ impl LogicalPlanBuilder {
#[cfg(feature = "csv")]
pub fn scan_csv<P: Into<std::path::PathBuf>>(
path: P,
- delimiter: u8,
+ delimiter_char: u8,
has_header: bool,
ignore_errors: bool,
mut skip_rows: usize,
@@ -314,7 +314,7 @@ impl LogicalPlanBuilder {
// this needs a way to estimate bytes/rows.
let (mut inferred_schema, rows_read, bytes_read) = infer_file_schema(
&reader_bytes,
- delimiter,
+ delimiter_char,
infer_schema_length,
has_header,
schema_overwrite,
@@ -368,7 +368,7 @@ impl LogicalPlanBuilder {
scan_type: FileScan::Csv {
options: CsvParserOptions {
has_header,
- delimiter,
+ delimiter: delimiter_char,
ignore_errors,
skip_rows,
low_memory,
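On the Rust side only the builder's argument is renamed: `scan_csv` now takes `delimiter_char: u8`, and it is still stored in the unchanged `delimiter` field of `CsvParserOptions`. Because the builder wants a single byte (`u8`), the Python layer converts its one-character strings with `ord()` before crossing into Rust, as the frame.py hunks below show. A small sketch of that conversion; the `_to_single_byte` helper is hypothetical, shown only to illustrate the invariant:

```python
def _to_single_byte(name: str, value: str) -> int:
    # The Rust builder expects a u8, so the argument must
    # encode to exactly one byte (e.g. ",", "|", "\t").
    encoded = value.encode()
    if len(encoded) != 1:
        raise ValueError(f"{name} must be a single-byte character")
    return encoded[0]

assert _to_single_byte("delimiter_char", "|") == ord("|")
```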
43 changes: 21 additions & 22 deletions py-polars/polars/dataframe/frame.py
@@ -55,6 +55,7 @@
from polars.exceptions import NoRowsReturnedError, TooManyRowsReturnedError
from polars.functions import col, lit
from polars.io._utils import _is_glob_pattern, _is_local_file
+ from polars.io.csv._utils import _check_arg_is_1byte
from polars.io.spreadsheet._write_utils import (
_unpack_multi_column_dict,
_xl_apply_conditional_formats,
@@ -655,9 +656,9 @@ def _read_csv(
*,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
- separator: str = ",",
+ delimiter_char: str = ",",
comment_char: str | None = None,
- quote_char: str | None = r'"',
+ quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
schema: None | SchemaDict = None,
@@ -734,7 +735,7 @@ def _read_csv(
scan = scan_csv(
source,
has_header=has_header,
- separator=separator,
+ delimiter_char=delimiter_char,
comment_char=comment_char,
quote_char=quote_char,
skip_rows=skip_rows,
@@ -775,7 +776,7 @@ def _read_csv(
n_rows,
skip_rows,
projection,
- separator,
+ ord(delimiter_char),
rechunk,
columns,
encoding,
@@ -784,15 +785,15 @@
dtype_list,
dtype_slice,
low_memory,
- comment_char,
- quote_char,
+ ord(comment_char) if comment_char else None,
+ ord(quote_char) if quote_char else None,
processed_null_values,
missing_utf8_is_empty_string,
try_parse_dates,
skip_rows_after_header,
_prepare_row_count_args(row_count_name, row_count_offset),
sample_size=sample_size,
- eol_char=eol_char,
+ eol_char=ord(eol_char),
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
schema=schema,
@@ -2441,9 +2442,9 @@ def write_csv(
file: None = None,
*,
has_header: bool = ...,
- separator: str = ...,
+ delimiter_char: str = ...,
line_terminator: str = ...,
- quote: str = ...,
+ quote_char: str = ...,
batch_size: int = ...,
datetime_format: str | None = ...,
date_format: str | None = ...,
@@ -2460,9 +2461,9 @@ def write_csv(
file: BytesIO | TextIOWrapper | str | Path,
*,
has_header: bool = ...,
- separator: str = ...,
+ delimiter_char: str = ...,
line_terminator: str = ...,
- quote: str = ...,
+ quote_char: str = ...,
batch_size: int = ...,
datetime_format: str | None = ...,
date_format: str | None = ...,
@@ -2478,9 +2479,9 @@ def write_csv(
file: BytesIO | TextIOWrapper | str | Path | None = None,
*,
has_header: bool = True,
- separator: str = ",",
+ delimiter_char: str = ",",
line_terminator: str = "\n",
- quote: str = '"',
+ quote_char: str = '"',
batch_size: int = 1024,
datetime_format: str | None = None,
date_format: str | None = None,
@@ -2499,11 +2500,11 @@ def write_csv(
(default), the output is returned as a string instead.
has_header
Whether to include header in the CSV output.
- separator
+ delimiter_char
Separate CSV fields with this symbol.
line_terminator
String used to end each row.
- quote
+ quote_char
Byte to use as quoting character.
batch_size
Number of rows that will be processed per thread.
@@ -2555,13 +2556,11 @@ def write_csv(
... }
... )
>>> path: pathlib.Path = dirpath / "new_file.csv"
- >>> df.write_csv(path, separator=",")
+ >>> df.write_csv(path, delimiter_char=",")

"""
- if len(separator) != 1:
-     raise ValueError("only single byte separator is allowed")
- if len(quote) != 1:
-     raise ValueError("only single byte quote char is allowed")
+ _check_arg_is_1byte("delimiter_char", delimiter_char, can_be_empty=False)
+ _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
if not null_value:
null_value = None

@@ -2577,9 +2576,9 @@ def write_csv(
self._df.write_csv(
file,
has_header,
- ord(separator),
+ ord(delimiter_char),
line_terminator,
- ord(quote),
+ ord(quote_char),
batch_size,
datetime_format,
date_format,
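Note that the two inline `len(...) != 1` checks are replaced by the shared `_check_arg_is_1byte` helper, with `can_be_empty=True` for `quote_char` only. A hedged usage sketch, assuming the helper raises `ValueError` just as the inline checks did:

```python
import polars as pl

df = pl.DataFrame({"a": [1]})

# A multi-byte delimiter is rejected before any writing happens:
try:
    df.write_csv(delimiter_char=";;")
except ValueError as exc:
    print(exc)  # e.g. "delimiter_char should be a single byte character"
```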
8 changes: 4 additions & 4 deletions py-polars/polars/io/csv/batched_reader.py
@@ -31,9 +31,9 @@ def __init__(
*,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
- separator: str = ",",
+ delimiter_char: str = ",",
comment_char: str | None = None,
- quote_char: str | None = r'"',
+ quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
null_values: str | Sequence[str] | dict[str, str] | None = None,
@@ -83,7 +83,7 @@ def __init__(
n_rows=n_rows,
skip_rows=skip_rows,
projection=projection,
- separator=separator,
+ delimiter_char=delimiter_char,
rechunk=rechunk,
columns=columns,
encoding=encoding,
@@ -123,7 +123,7 @@ def next_batches(self, n: int) -> list[DataFrame] | None:
--------
>>> reader = pl.read_csv_batched(
... "./tpch/tables_scale_100/lineitem.tbl",
- ... separator="|",
+ ... delimiter_char="|",
... try_parse_dates=True,
... ) # doctest: +SKIP
>>> reader.next_batches(5) # doctest: +SKIP
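The batched reader simply forwards the renamed argument. A usage sketch mirroring the doctest above (the path is illustrative, and this assumes `next_batches` returns `None` once the source is exhausted):

```python
import polars as pl

reader = pl.read_csv_batched(
    "lineitem.tbl",  # illustrative path
    delimiter_char="|",
    try_parse_dates=True,
)
while (batches := reader.next_batches(5)) is not None:
    for df in batches:
        ...  # process each DataFrame chunk
```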
52 changes: 30 additions & 22 deletions py-polars/polars/io/csv/functions.py
@@ -23,9 +23,9 @@ def read_csv(
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
new_columns: Sequence[str] | None = None,
- separator: str = ",",
+ delimiter_char: str = ",",
comment_char: str | None = None,
- quote_char: str | None = r'"',
+ quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
schema: SchemaDict | None = None,
@@ -50,7 +50,7 @@ def read_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
) -> DataFrame:
"""
r"""
Read a CSV file into a DataFrame.

Parameters
@@ -72,7 +72,7 @@ def read_csv(
Rename columns right after parsing the CSV file. If the given
list is shorter than the width of the DataFrame the remaining
columns will have their original name.
- separator
+ delimiter_char
Single byte character to use as delimiter in the file.
comment_char
Single byte character that indicates the start of a comment line,
@@ -159,7 +159,9 @@ def read_csv(
Set the sample size. This is used to sample statistics to estimate the
allocation needed.
eol_char
- Single byte end of line character.
+ Single byte end of line character (default: `\n`). When reading a file with
+ Windows line endings (`\r\n`), the default `\n` still works; the extra `\r`
+ is removed during parsing.
raise_if_empty
When there is no data in the source, ``NoDataError`` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -182,7 +184,7 @@ def read_csv(
an expensive operation.

"""
_check_arg_is_1byte("separator", separator, can_be_empty=False)
_check_arg_is_1byte("delimiter_char", delimiter_char, can_be_empty=False)
_check_arg_is_1byte("comment_char", comment_char, can_be_empty=False)
_check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
_check_arg_is_1byte("eol_char", eol_char, can_be_empty=False)
@@ -239,7 +241,7 @@ def read_csv(
encoding=encoding,
),
pa.csv.ParseOptions(
- delimiter=separator,
+ delimiter=delimiter_char,
quote_char=quote_char if quote_char else False,
double_quote=quote_char is not None and quote_char == '"',
),
@@ -365,7 +367,7 @@ def read_csv(
data,
has_header=has_header,
columns=columns if columns else projection,
- separator=separator,
+ delimiter_char=delimiter_char,
comment_char=comment_char,
quote_char=quote_char,
skip_rows=skip_rows,
@@ -402,9 +404,9 @@ def read_csv_batched(
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
new_columns: Sequence[str] | None = None,
- separator: str = ",",
+ delimiter_char: str = ",",
comment_char: str | None = None,
- quote_char: str | None = r'"',
+ quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
null_values: str | Sequence[str] | dict[str, str] | None = None,
@@ -425,7 +427,7 @@ def read_csv_batched(
eol_char: str = "\n",
raise_if_empty: bool = True,
) -> BatchedCsvReader:
"""
r"""
Read a CSV file in batches.

Upon creation of the ``BatchedCsvReader``, Polars will gather statistics and
Expand All @@ -451,7 +453,7 @@ def read_csv_batched(
Rename columns right after parsing the CSV file. If the given
list is shorter than the width of the DataFrame the remaining
columns will have their original name.
- separator
+ delimiter_char
Single byte character to use as delimiter in the file.
comment_char
Single byte character that indicates the start of a comment line,
@@ -517,7 +519,9 @@ def read_csv_batched(
Set the sample size. This is used to sample statistics to estimate the
allocation needed.
eol_char
- Single byte end of line character.
+ Single byte end of line character (default: `\n`). When reading a file with
+ Windows line endings (`\r\n`), the default `\n` still works; the extra `\r`
+ is removed during parsing.
raise_if_empty
When there is no data in the source, ``NoDataError`` is raised. If this parameter
is set to False, ``None`` will be returned from ``next_batches(n)`` instead.
@@ -533,7 +537,9 @@ def read_csv_batched(
Examples
--------
>>> reader = pl.read_csv_batched(
... "./tpch/tables_scale_100/lineitem.tbl", separator="|", try_parse_dates=True
... "./tpch/tables_scale_100/lineitem.tbl",
... delimiter_char="|",
... try_parse_dates=True,
... ) # doctest: +SKIP
>>> batches = reader.next_batches(5) # doctest: +SKIP
>>> for df in batches: # doctest: +SKIP
@@ -662,7 +668,7 @@ def read_csv_batched(
source,
has_header=has_header,
columns=columns if columns else projection,
- separator=separator,
+ delimiter_char=delimiter_char,
comment_char=comment_char,
quote_char=quote_char,
skip_rows=skip_rows,
@@ -692,9 +698,9 @@ def scan_csv(
source: str | Path,
*,
has_header: bool = True,
- separator: str = ",",
+ delimiter_char: str = ",",
comment_char: str | None = None,
- quote_char: str | None = r'"',
+ quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: SchemaDict | Sequence[PolarsDataType] | None = None,
schema: SchemaDict | None = None,
@@ -717,7 +723,7 @@ def scan_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
) -> LazyFrame:
"""
r"""
Lazily read from a CSV file or multiple files via glob patterns.

This allows the query optimizer to push down predicates and
Expand All @@ -733,7 +739,7 @@ def scan_csv(
If set to False, column names will be autogenerated in the
following format: ``column_x``, with ``x`` being an
enumeration over every column in the dataset starting at 1.
- separator
+ delimiter_char
Single byte character to use as delimiter in the file.
comment_char
Single byte character that indicates the start of a comment line,
@@ -796,7 +802,9 @@ def scan_csv(
can be inferred, as well as a handful of others. If this does not succeed,
the column remains of data type ``pl.Utf8``.
eol_char
- Single byte end of line character
+ Single byte end of line character (default: `\n`). When reading a file with
+ Windows line endings (`\r\n`), the default `\n` still works; the extra `\r`
+ is removed during parsing.
new_columns
Provide an explicit list of string column names to use (for example, when
scanning a headerless CSV file). If the given list is shorter than the width of
@@ -891,7 +899,7 @@ def with_column_names(cols: list[str]) -> list[str]:
else:
return new_columns # type: ignore[return-value]

_check_arg_is_1byte("separator", separator, can_be_empty=False)
_check_arg_is_1byte("delimiter_char", delimiter_char, can_be_empty=False)
_check_arg_is_1byte("comment_char", comment_char, can_be_empty=False)
_check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)

@@ -901,7 +909,7 @@ def with_column_names(cols: list[str]) -> list[str]:
return pl.LazyFrame._scan_csv(
source,
has_header=has_header,
- separator=separator,
+ delimiter_char=delimiter_char,
comment_char=comment_char,
quote_char=quote_char,
skip_rows=skip_rows,
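The lazy API takes the same renamed parameter, and the expanded `eol_char` docs apply here too. A short sketch (the glob pattern and column name are illustrative):

```python
import polars as pl

lf = pl.scan_csv(
    "data/*.csv",  # glob pattern, illustrative
    delimiter_char=",",
    quote_char='"',
    eol_char="\n",  # fine even for \r\n files; the stray \r is stripped
)
df = lf.select(pl.col("a")).collect()
```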