Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

depr(python, rust!): Rename write_csv parameter quote to quote_char #11583

Merged
6 changes: 3 additions & 3 deletions crates/polars-plan/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ impl LogicalPlanBuilder {
#[cfg(feature = "csv")]
pub fn scan_csv<P: Into<std::path::PathBuf>>(
path: P,
delimiter: u8,
separator: u8,
has_header: bool,
ignore_errors: bool,
mut skip_rows: usize,
Expand Down Expand Up @@ -314,7 +314,7 @@ impl LogicalPlanBuilder {
// this needs a way to estimated bytes/rows.
let (mut inferred_schema, rows_read, bytes_read) = infer_file_schema(
&reader_bytes,
delimiter,
separator,
infer_schema_length,
has_header,
schema_overwrite,
Expand Down Expand Up @@ -368,7 +368,7 @@ impl LogicalPlanBuilder {
scan_type: FileScan::Csv {
options: CsvParserOptions {
has_header,
delimiter,
delimiter: separator,
ignore_errors,
skip_rows,
low_memory,
Expand Down
28 changes: 14 additions & 14 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from polars.exceptions import NoRowsReturnedError, TooManyRowsReturnedError
from polars.functions import col, lit
from polars.io._utils import _is_glob_pattern, _is_local_file
from polars.io.csv._utils import _check_arg_is_1byte
from polars.io.spreadsheet._write_utils import (
_unpack_multi_column_dict,
_xl_apply_conditional_formats,
Expand Down Expand Up @@ -657,7 +658,7 @@ def _read_csv(
columns: Sequence[int] | Sequence[str] | None = None,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
schema: None | SchemaDict = None,
Expand Down Expand Up @@ -775,7 +776,7 @@ def _read_csv(
n_rows,
skip_rows,
projection,
separator,
ord(separator),
rechunk,
columns,
encoding,
Expand All @@ -784,15 +785,15 @@ def _read_csv(
dtype_list,
dtype_slice,
low_memory,
comment_char,
quote_char,
ord(comment_char) if comment_char else None,
ord(quote_char) if quote_char else None,
processed_null_values,
missing_utf8_is_empty_string,
try_parse_dates,
skip_rows_after_header,
_prepare_row_count_args(row_count_name, row_count_offset),
sample_size=sample_size,
eol_char=eol_char,
eol_char=ord(eol_char),
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
schema=schema,
Expand Down Expand Up @@ -2443,7 +2444,7 @@ def write_csv(
has_header: bool = ...,
separator: str = ...,
line_terminator: str = ...,
quote: str = ...,
quote_char: str = ...,
batch_size: int = ...,
datetime_format: str | None = ...,
date_format: str | None = ...,
Expand All @@ -2462,7 +2463,7 @@ def write_csv(
has_header: bool = ...,
separator: str = ...,
line_terminator: str = ...,
quote: str = ...,
quote_char: str = ...,
batch_size: int = ...,
datetime_format: str | None = ...,
date_format: str | None = ...,
Expand All @@ -2473,14 +2474,15 @@ def write_csv(
) -> None:
...

@deprecate_renamed_parameter("quote", "quote_char", version="0.19.7")
svaningelgem marked this conversation as resolved.
Show resolved Hide resolved
def write_csv(
self,
file: BytesIO | TextIOWrapper | str | Path | None = None,
*,
has_header: bool = True,
separator: str = ",",
line_terminator: str = "\n",
quote: str = '"',
quote_char: str = '"',
batch_size: int = 1024,
datetime_format: str | None = None,
date_format: str | None = None,
Expand All @@ -2503,7 +2505,7 @@ def write_csv(
Separate CSV fields with this symbol.
line_terminator
String used to end each row.
quote
quote_char
Byte to use as quoting character.
batch_size
Number of rows that will be processed per thread.
Expand Down Expand Up @@ -2558,10 +2560,8 @@ def write_csv(
>>> df.write_csv(path, separator=",")

"""
if len(separator) != 1:
raise ValueError("only single byte separator is allowed")
if len(quote) != 1:
raise ValueError("only single byte quote char is allowed")
_check_arg_is_1byte("separator", separator, can_be_empty=False)
_check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
if not null_value:
null_value = None

Expand All @@ -2579,7 +2579,7 @@ def write_csv(
has_header,
ord(separator),
line_terminator,
ord(quote),
ord(quote_char),
batch_size,
datetime_format,
date_format,
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io/csv/batched_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(
columns: Sequence[int] | Sequence[str] | None = None,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None,
null_values: str | Sequence[str] | dict[str, str] | None = None,
Expand Down
28 changes: 18 additions & 10 deletions py-polars/polars/io/csv/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def read_csv(
new_columns: Sequence[str] | None = None,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
schema: SchemaDict | None = None,
Expand All @@ -50,7 +50,7 @@ def read_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
) -> DataFrame:
"""
r"""
Read a CSV file into a DataFrame.

Parameters
Expand Down Expand Up @@ -159,7 +159,9 @@ def read_csv(
Set the sample size. This is used to sample statistics to estimate the
allocation needed.
eol_char
Single byte end of line character.
Single byte end of line character (default: `\n`). When encountering a file
with Windows line endings (`\r\n`), one can go with the default `\n`. The extra
`\r` will be removed when processed.
raise_if_empty
When there is no data in the source, ``NoDataError`` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
Expand Down Expand Up @@ -404,7 +406,7 @@ def read_csv_batched(
new_columns: Sequence[str] | None = None,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None,
null_values: str | Sequence[str] | dict[str, str] | None = None,
Expand All @@ -425,7 +427,7 @@ def read_csv_batched(
eol_char: str = "\n",
raise_if_empty: bool = True,
) -> BatchedCsvReader:
"""
r"""
Read a CSV file in batches.

Upon creation of the ``BatchedCsvReader``, Polars will gather statistics and
Expand Down Expand Up @@ -517,7 +519,9 @@ def read_csv_batched(
Set the sample size. This is used to sample statistics to estimate the
allocation needed.
eol_char
Single byte end of line character.
Single byte end of line character (default: `\n`). When encountering a file
with Windows line endings (`\r\n`), one can go with the default `\n`. The extra
`\r` will be removed when processed.
raise_if_empty
When there is no data in the source, ``NoDataError`` is raised. If this parameter
is set to False, ``None`` will be returned from ``next_batches(n)`` instead.
Expand All @@ -533,7 +537,9 @@ def read_csv_batched(
Examples
--------
>>> reader = pl.read_csv_batched(
... "./tpch/tables_scale_100/lineitem.tbl", separator="|", try_parse_dates=True
... "./tpch/tables_scale_100/lineitem.tbl",
... separator="|",
... try_parse_dates=True,
... ) # doctest: +SKIP
>>> batches = reader.next_batches(5) # doctest: +SKIP
>>> for df in batches: # doctest: +SKIP
Expand Down Expand Up @@ -694,7 +700,7 @@ def scan_csv(
has_header: bool = True,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: SchemaDict | Sequence[PolarsDataType] | None = None,
schema: SchemaDict | None = None,
Expand All @@ -717,7 +723,7 @@ def scan_csv(
raise_if_empty: bool = True,
truncate_ragged_lines: bool = False,
) -> LazyFrame:
"""
r"""
Lazily read from a CSV file or multiple files via glob patterns.

This allows the query optimizer to push down predicates and
Expand Down Expand Up @@ -796,7 +802,9 @@ def scan_csv(
can be inferred, as well as a handful of others. If this does not succeed,
the column remains of data type ``pl.Utf8``.
eol_char
Single byte end of line character
Single byte end of line character (default: `\n`). When encountering a file
with Windows line endings (`\r\n`), one can go with the default `\n`. The extra
`\r` will be removed when processed.
new_columns
Provide an explicit list of string column names to use (for example, when
scanning a headerless CSV file). If the given list is shorter than the width of
Expand Down
27 changes: 14 additions & 13 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
)
from polars.dependencies import dataframe_api_compat, subprocess
from polars.io._utils import _is_local_file, _is_supported_cloud
from polars.io.csv._utils import _check_arg_is_1byte
from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec
from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec
from polars.lazyframe.group_by import LazyGroupBy
Expand Down Expand Up @@ -316,7 +317,7 @@ def _scan_csv(
has_header: bool = True,
separator: str = ",",
comment_char: str | None = None,
quote_char: str | None = r'"',
quote_char: str | None = '"',
skip_rows: int = 0,
dtypes: SchemaDict | None = None,
schema: SchemaDict | None = None,
Expand Down Expand Up @@ -358,16 +359,16 @@ def _scan_csv(
self = cls.__new__(cls)
self._ldf = PyLazyFrame.new_from_csv(
source,
separator,
ord(separator),
has_header,
ignore_errors,
skip_rows,
n_rows,
cache,
dtype_list,
low_memory,
comment_char,
quote_char,
ord(comment_char) if comment_char else None,
ord(quote_char) if quote_char else None,
processed_null_values,
missing_utf8_is_empty_string,
infer_schema_length,
Expand All @@ -377,7 +378,7 @@ def _scan_csv(
encoding,
_prepare_row_count_args(row_count_name, row_count_offset),
try_parse_dates,
eol_char=eol_char,
eol_char=ord(eol_char),
raise_if_empty=raise_if_empty,
truncate_ragged_lines=truncate_ragged_lines,
schema=schema,
Expand Down Expand Up @@ -2026,14 +2027,15 @@ def sink_ipc(
maintain_order=maintain_order,
)

@deprecate_renamed_parameter("quote", "quote_char", version="0.19.7")
def sink_csv(
self,
path: str | Path,
*,
has_header: bool = True,
separator: str = ",",
line_terminator: str = "\n",
quote: str = '"',
quote_char: str = '"',
batch_size: int = 1024,
datetime_format: str | None = None,
date_format: str | None = None,
Expand Down Expand Up @@ -2064,7 +2066,7 @@ def sink_csv(
Separate CSV fields with this symbol.
line_terminator
String used to end each row.
quote
quote_char
Byte to use as quoting character.
batch_size
Number of rows that will be processed per thread.
Expand Down Expand Up @@ -2097,7 +2099,8 @@ def sink_csv(
This is the default.
- always: This puts quotes around every field. Always.
- never: This never puts quotes around fields, even if that results in
invalid CSV data (e.g.: by not quoting strings containing the separator).
invalid CSV data (e.g.: by not quoting strings containing the
separator).
- non_numeric: This puts quotes around all fields that are non-numeric.
Namely, when writing a field that does not parse as a valid float
or integer, then quotes will be used even if they aren't strictly
Expand Down Expand Up @@ -2128,10 +2131,8 @@ def sink_csv(
>>> lf.sink_csv("out.csv") # doctest: +SKIP

"""
if len(separator) != 1:
raise ValueError("only single byte separator is allowed")
if len(quote) != 1:
raise ValueError("only single byte quote char is allowed")
_check_arg_is_1byte("separator", separator, can_be_empty=False)
_check_arg_is_1byte("quote_char", quote_char, can_be_empty=False)
svaningelgem marked this conversation as resolved.
Show resolved Hide resolved
if not null_value:
null_value = None

Expand All @@ -2149,7 +2150,7 @@ def sink_csv(
has_header=has_header,
separator=ord(separator),
line_terminator=line_terminator,
quote=ord(quote),
quote_char=ord(quote_char),
batch_size=batch_size,
datetime_format=datetime_format,
date_format=date_format,
Expand Down
Loading