From 3564a77893b19cb72448bcf5846a1b68d821ab2d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 27 Apr 2024 10:06:06 +0200 Subject: [PATCH] feat(rust!): Rename to `CsvParserOptions` to `CsvReaderOptions`, use in `CsvReader` (#15919) --- crates/polars-io/src/csv/read/mod.rs | 2 +- crates/polars-io/src/csv/read/options.rs | 29 +- crates/polars-io/src/csv/read/reader.rs | 380 ++++++++---------- .../src/physical_plan/executors/scan/csv.rs | 2 +- .../polars-pipe/src/executors/sources/csv.rs | 6 +- .../src/logical_plan/builder_dsl.rs | 4 +- .../src/logical_plan/conversion/scans.rs | 2 +- .../polars-plan/src/logical_plan/file_scan.rs | 4 +- py-polars/polars/io/csv/functions.py | 12 +- 9 files changed, 207 insertions(+), 234 deletions(-) diff --git a/crates/polars-io/src/csv/read/mod.rs b/crates/polars-io/src/csv/read/mod.rs index 3fad37ce049a..5f5b93948f02 100644 --- a/crates/polars-io/src/csv/read/mod.rs +++ b/crates/polars-io/src/csv/read/mod.rs @@ -26,7 +26,7 @@ mod reader; mod splitfields; mod utils; -pub use options::{CommentPrefix, CsvEncoding, CsvParserOptions, NullValues}; +pub use options::{CommentPrefix, CsvEncoding, CsvReaderOptions, NullValues}; pub use parser::count_rows; pub use read_impl::batched_mmap::{BatchedCsvReaderMmap, OwnedBatchedCsvReaderMmap}; pub use read_impl::batched_read::{BatchedCsvReaderRead, OwnedBatchedCsvReader}; diff --git a/crates/polars-io/src/csv/read/options.rs b/crates/polars-io/src/csv/read/options.rs index f3b4f26ef1cc..3741bd6d9e47 100644 --- a/crates/polars-io/src/csv/read/options.rs +++ b/crates/polars-io/src/csv/read/options.rs @@ -5,11 +5,11 @@ use serde::{Deserialize, Serialize}; #[derive(Clone, Debug, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct CsvParserOptions { +pub struct CsvReaderOptions { pub has_header: bool, pub separator: u8, - pub comment_prefix: Option, pub quote_char: Option, + pub comment_prefix: Option, pub eol_char: u8, pub encoding: CsvEncoding, pub 
skip_rows: usize, @@ -27,13 +27,13 @@ pub struct CsvParserOptions { pub low_memory: bool, } -impl Default for CsvParserOptions { +impl Default for CsvReaderOptions { fn default() -> Self { Self { has_header: true, separator: b',', - comment_prefix: None, quote_char: Some(b'"'), + comment_prefix: None, eol_char: b'\n', encoding: CsvEncoding::default(), skip_rows: 0, @@ -75,17 +75,22 @@ pub enum CommentPrefix { impl CommentPrefix { /// Creates a new `CommentPrefix` for the `Single` variant. - pub fn new_single(c: u8) -> Self { - CommentPrefix::Single(c) + pub fn new_single(prefix: u8) -> Self { + CommentPrefix::Single(prefix) + } + + /// Creates a new `CommentPrefix` for the `Multi` variant. + pub fn new_multi(prefix: String) -> Self { + CommentPrefix::Multi(prefix) } - /// Creates a new `CommentPrefix`. If `Multi` variant is used and the string is longer - /// than 5 characters, it will return `None`. - pub fn new_multi(s: String) -> Option { - if s.len() <= 5 { - Some(CommentPrefix::Multi(s)) + /// Creates a new `CommentPrefix` from a `&str`. 
+ pub fn new_from_str(prefix: &str) -> Self { + if prefix.len() == 1 && prefix.chars().next().unwrap().is_ascii() { + let c = prefix.as_bytes()[0]; + CommentPrefix::Single(c) } else { - None + CommentPrefix::Multi(prefix.to_string()) } } } diff --git a/crates/polars-io/src/csv/read/reader.rs b/crates/polars-io/src/csv/read/reader.rs index ffe5c1f01353..65fcff4b3c47 100644 --- a/crates/polars-io/src/csv/read/reader.rs +++ b/crates/polars-io/src/csv/read/reader.rs @@ -8,7 +8,7 @@ use polars_time::prelude::*; use rayon::prelude::*; use super::infer_file_schema; -use super::options::{CommentPrefix, CsvEncoding, NullValues}; +use super::options::{CommentPrefix, CsvEncoding, CsvReaderOptions, NullValues}; use super::read_impl::batched_mmap::{ to_batched_owned_mmap, BatchedCsvReaderMmap, OwnedBatchedCsvReaderMmap, }; @@ -42,43 +42,25 @@ pub struct CsvReader<'a, R> where R: MmapBytesReader, { - /// File or Stream object + /// File or Stream object. reader: R, + /// Options for the CSV reader. + options: CsvReaderOptions, /// Stop reading from the csv after this number of rows is reached n_rows: Option, - // used by error ignore logic - max_records: Option, - skip_rows_before_header: usize, /// Optional indexes of the columns to project projection: Option>, /// Optional column names to project/ select. columns: Option>, - separator: Option, - pub(crate) schema: Option, - encoding: CsvEncoding, - n_threads: Option, path: Option, - schema_overwrite: Option, dtype_overwrite: Option<&'a [DataType]>, sample_size: usize, chunk_size: usize, - comment_prefix: Option, - null_values: Option, predicate: Option>, - quote_char: Option, - skip_rows_after_header: usize, - try_parse_dates: bool, row_index: Option, /// Aggregates chunk afterwards to a single chunk. 
rechunk: bool, - raise_if_empty: bool, - truncate_ragged_lines: bool, missing_is_null: bool, - low_memory: bool, - has_header: bool, - ignore_errors: bool, - eol_char: u8, - decimal_comma: bool, } impl<'a, R> CsvReader<'a, R> @@ -86,39 +68,62 @@ where R: 'a + MmapBytesReader, { /// Skip these rows after the header - pub fn with_skip_rows_after_header(mut self, offset: usize) -> Self { - self.skip_rows_after_header = offset; + pub fn with_options(mut self, options: CsvReaderOptions) -> Self { + self.options = options; self } - /// Add a row index column. - pub fn with_row_index(mut self, row_index: Option) -> Self { - self.row_index = row_index; + /// Sets whether the CSV file has headers + pub fn has_header(mut self, has_header: bool) -> Self { + self.options.has_header = has_header; self } - /// Sets the chunk size used by the parser. This influences performance - pub fn with_chunk_size(mut self, chunk_size: usize) -> Self { - self.chunk_size = chunk_size; + /// Sets the CSV file's column separator as a byte character + pub fn with_separator(mut self, separator: u8) -> Self { + self.options.separator = separator; self } - /// Set [`CsvEncoding`] - pub fn with_encoding(mut self, enc: CsvEncoding) -> Self { - self.encoding = enc; + /// Sets the `char` used as quote char. The default is `b'"'`. If set to [`None`], quoting is disabled. + pub fn with_quote_char(mut self, quote_char: Option) -> Self { + self.options.quote_char = quote_char; self } - /// Try to stop parsing when `n` rows are parsed. During multithreaded parsing the upper bound `n` cannot - /// be guaranteed. - pub fn with_n_rows(mut self, num_rows: Option) -> Self { - self.n_rows = num_rows; + /// Sets the comment prefix for this instance. Lines starting with this prefix will be ignored. 
+ pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self { + self.options.comment_prefix = comment_prefix.map(CommentPrefix::new_from_str); self } - /// Continue with next batch when a ParserError is encountered. - pub fn with_ignore_errors(mut self, ignore: bool) -> Self { - self.ignore_errors = ignore; + /// Sets the comment prefix from `CsvParserOptions` for internal initialization. + pub fn _with_comment_prefix(mut self, comment_prefix: Option) -> Self { + self.options.comment_prefix = comment_prefix; + self + } + + /// Set the `char` used as end-of-line char. The default is `b'\n'`. + pub fn with_end_of_line_char(mut self, eol_char: u8) -> Self { + self.options.eol_char = eol_char; + self + } + + /// Set [`CsvEncoding`]. + pub fn with_encoding(mut self, encoding: CsvEncoding) -> Self { + self.options.encoding = encoding; + self + } + + /// Skip the first `n` rows during parsing. The header will be parsed at `n` lines. + pub fn with_skip_rows(mut self, n: usize) -> Self { + self.options.skip_rows = n; + self + } + + /// Skip these rows after the header + pub fn with_skip_rows_after_header(mut self, n: usize) -> Self { + self.options.skip_rows_after_header = n; self } @@ -127,74 +132,111 @@ where /// /// It is recommended to use [with_dtypes](Self::with_dtypes) instead. pub fn with_schema(mut self, schema: Option) -> Self { - self.schema = schema; + self.options.schema = schema; self } - /// Skip the first `n` rows during parsing. The header will be parsed at `n` lines. - pub fn with_skip_rows(mut self, skip_rows: usize) -> Self { - self.skip_rows_before_header = skip_rows; + /// Overwrite the schema with the dtypes in this given Schema. The given schema may be a subset + /// of the total schema. + pub fn with_dtypes(mut self, schema: Option) -> Self { + self.options.schema_overwrite = schema; self } - /// Rechunk the DataFrame to contiguous memory after the CSV is parsed. 
- pub fn with_rechunk(mut self, rechunk: bool) -> Self { - self.rechunk = rechunk; + /// Set the CSV reader to infer the schema of the file + /// + /// # Arguments + /// * `n` - Maximum number of rows read for schema inference. + /// Setting this to `None` will do a full table scan (slow). + pub fn infer_schema(mut self, n: Option) -> Self { + // used by error ignore logic + self.options.infer_schema_length = n; self } - /// Set whether the CSV file has headers - pub fn has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + /// Automatically try to parse dates/ datetimes and time. If parsing fails, columns remain of dtype `[DataType::String]`. + pub fn with_try_parse_dates(mut self, toggle: bool) -> Self { + self.options.try_parse_dates = toggle; self } - /// Set the CSV file's column separator as a byte character - pub fn with_separator(mut self, separator: u8) -> Self { - self.separator = Some(separator); + /// Set values that will be interpreted as missing/null. + /// + /// Note: any value you set as null value will not be escaped, so if quotation marks + /// are part of the null value you should include them. + pub fn with_null_values(mut self, null_values: Option) -> Self { + self.options.null_values = null_values; self } - /// Set the comment prefix for this instance. Lines starting with this prefix will be ignored. - pub fn with_comment_prefix(mut self, comment_prefix: Option<&str>) -> Self { - self.comment_prefix = comment_prefix.map(|s| { - if s.len() == 1 && s.chars().next().unwrap().is_ascii() { - CommentPrefix::Single(s.as_bytes()[0]) - } else { - CommentPrefix::Multi(s.to_string()) - } - }); + /// Continue with next batch when a ParserError is encountered. + pub fn with_ignore_errors(mut self, toggle: bool) -> Self { + self.options.ignore_errors = toggle; self } - /// Sets the comment prefix from `CsvParserOptions` for internal initialization. 
- pub fn _with_comment_prefix(mut self, comment_prefix: Option) -> Self { - self.comment_prefix = comment_prefix; + /// Raise an error if CSV is empty (otherwise return an empty frame) + pub fn raise_if_empty(mut self, toggle: bool) -> Self { + self.options.raise_if_empty = toggle; self } - pub fn with_end_of_line_char(mut self, eol_char: u8) -> Self { - self.eol_char = eol_char; + /// Truncate lines that are longer than the schema. + pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { + self.options.truncate_ragged_lines = toggle; self } - /// Set values that will be interpreted as missing/ null. Note that any value you set as null value - /// will not be escaped, so if quotation marks are part of the null value you should include them. - pub fn with_null_values(mut self, null_values: Option) -> Self { - self.null_values = null_values; + /// Parse floats with a comma as decimal separator. + pub fn with_decimal_comma(mut self, toggle: bool) -> Self { + self.options.decimal_comma = toggle; self } - /// Treat missing fields as null. - pub fn with_missing_is_null(mut self, missing_is_null: bool) -> Self { - self.missing_is_null = missing_is_null; + /// Set the number of threads used in CSV reading. The default uses the number of cores of + /// your cpu. + /// + /// Note that this only works if this is initialized with `CsvReader::from_path`. + /// Note that the number of cores is the maximum allowed number of threads. + pub fn with_n_threads(mut self, n: Option) -> Self { + self.options.n_threads = n; self } - /// Overwrite the schema with the dtypes in this given Schema. The given schema may be a subset - /// of the total schema. - pub fn with_dtypes(mut self, schema: Option) -> Self { - self.schema_overwrite = schema; + /// Reduce memory consumption at the expense of performance + pub fn low_memory(mut self, toggle: bool) -> Self { + self.options.low_memory = toggle; + self + } + + /// Add a row index column. 
+ pub fn with_row_index(mut self, row_index: Option) -> Self { + self.row_index = row_index; + self + } + + /// Sets the chunk size used by the parser. This influences performance + pub fn with_chunk_size(mut self, chunk_size: usize) -> Self { + self.chunk_size = chunk_size; + self + } + + /// Try to stop parsing when `n` rows are parsed. During multithreaded parsing the upper bound `n` cannot + /// be guaranteed. + pub fn with_n_rows(mut self, num_rows: Option) -> Self { + self.n_rows = num_rows; + self + } + + /// Rechunk the DataFrame to contiguous memory after the CSV is parsed. + pub fn with_rechunk(mut self, rechunk: bool) -> Self { + self.rechunk = rechunk; + self + } + + /// Treat missing fields as null. + pub fn with_missing_is_null(mut self, missing_is_null: bool) -> Self { + self.missing_is_null = missing_is_null; self } @@ -205,17 +247,6 @@ where self } - /// Set the CSV reader to infer the schema of the file - /// - /// # Arguments - /// * `max_records` - Maximum number of rows read for schema inference. - /// Setting this to `None` will do a full table scan (slow). - pub fn infer_schema(mut self, max_records: Option) -> Self { - // used by error ignore logic - self.max_records = max_records; - self - } - /// Set the reader's column projection. This counts from 0, meaning that /// `vec![0, 4]` would select the 1st and 5th column. pub fn with_projection(mut self, projection: Option>) -> Self { @@ -229,16 +260,6 @@ where self } - /// Set the number of threads used in CSV reading. The default uses the number of cores of - /// your cpu. - /// - /// Note that this only works if this is initialized with `CsvReader::from_path`. - /// Note that the number of cores is the maximum allowed number of threads. - pub fn with_n_threads(mut self, n: Option) -> Self { - self.n_threads = n; - self - } - /// The preferred way to initialize this builder. This allows the CSV file to be memory mapped /// and thereby greatly increases parsing performance. 
pub fn with_path<P: Into<PathBuf>>(mut self, path: Option<P>
) -> Self { @@ -254,46 +275,10 @@ where self } - /// Raise an error if CSV is empty (otherwise return an empty frame) - pub fn raise_if_empty(mut self, toggle: bool) -> Self { - self.raise_if_empty = toggle; - self - } - - /// Reduce memory consumption at the expense of performance - pub fn low_memory(mut self, toggle: bool) -> Self { - self.low_memory = toggle; - self - } - - /// Set the `char` used as quote char. The default is `b'"'`. If set to `[None]` quoting is disabled. - pub fn with_quote_char(mut self, quote_char: Option) -> Self { - self.quote_char = quote_char; - self - } - - /// Automatically try to parse dates/ datetimes and time. If parsing fails, columns remain of dtype `[DataType::String]`. - pub fn with_try_parse_dates(mut self, toggle: bool) -> Self { - self.try_parse_dates = toggle; - self - } - pub fn with_predicate(mut self, predicate: Option>) -> Self { self.predicate = predicate; self } - - /// Truncate lines that are longer than the schema. - pub fn truncate_ragged_lines(mut self, toggle: bool) -> Self { - self.truncate_ragged_lines = toggle; - self - } - - /// Parse floats with decimals. 
- pub fn with_decimal_comma(mut self, toggle: bool) -> Self { - self.decimal_comma = toggle; - self - } } impl<'a> CsvReader<'a, File> { @@ -318,34 +303,34 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> { CoreReader::new( reader_bytes, self.n_rows, - self.skip_rows_before_header, + self.options.skip_rows, std::mem::take(&mut self.projection), - self.max_records, - self.separator, - self.has_header, - self.ignore_errors, - self.schema.clone(), + self.options.infer_schema_length, + Some(self.options.separator), + self.options.has_header, + self.options.ignore_errors, + self.options.schema.clone(), std::mem::take(&mut self.columns), - self.encoding, - self.n_threads, + self.options.encoding, + self.options.n_threads, schema, self.dtype_overwrite, self.sample_size, self.chunk_size, - self.low_memory, - std::mem::take(&mut self.comment_prefix), - self.quote_char, - self.eol_char, - std::mem::take(&mut self.null_values), + self.options.low_memory, + std::mem::take(&mut self.options.comment_prefix), + self.options.quote_char, + self.options.eol_char, + std::mem::take(&mut self.options.null_values), self.missing_is_null, std::mem::take(&mut self.predicate), to_cast, - self.skip_rows_after_header, + self.options.skip_rows_after_header, std::mem::take(&mut self.row_index), - self.try_parse_dates, - self.raise_if_empty, - self.truncate_ragged_lines, - self.decimal_comma, + self.options.try_parse_dates, + self.options.raise_if_empty, + self.options.truncate_ragged_lines, + self.options.decimal_comma, ) } @@ -403,26 +388,26 @@ impl<'a, R: MmapBytesReader + 'a> CsvReader<'a, R> { } pub fn batched_borrowed_mmap(&'a mut self) -> PolarsResult> { - if let Some(schema) = self.schema_overwrite.as_deref() { + if let Some(schema) = self.options.schema_overwrite.as_deref() { let (schema, to_cast, has_cat) = self.prepare_schema_overwrite(schema)?; let schema = Arc::new(schema); let csv_reader = self.core_reader(Some(schema), to_cast)?; csv_reader.batched_mmap(has_cat) } else { - 
let csv_reader = self.core_reader(self.schema.clone(), vec![])?; + let csv_reader = self.core_reader(self.options.schema.clone(), vec![])?; csv_reader.batched_mmap(false) } } pub fn batched_borrowed_read(&'a mut self) -> PolarsResult> { - if let Some(schema) = self.schema_overwrite.as_deref() { + if let Some(schema) = self.options.schema_overwrite.as_deref() { let (schema, to_cast, has_cat) = self.prepare_schema_overwrite(schema)?; let schema = Arc::new(schema); let csv_reader = self.core_reader(Some(schema), to_cast)?; csv_reader.batched_read(has_cat) } else { - let csv_reader = self.core_reader(self.schema.clone(), vec![])?; + let csv_reader = self.core_reader(self.options.schema.clone(), vec![])?; csv_reader.batched_read(false) } } @@ -440,20 +425,20 @@ impl<'a> CsvReader<'a, Box> { let (inferred_schema, _, _) = infer_file_schema( &reader_bytes, - self.separator.unwrap_or(b','), - self.max_records, - self.has_header, + self.options.separator, + self.options.infer_schema_length, + self.options.has_header, None, - &mut self.skip_rows_before_header, - self.skip_rows_after_header, - self.comment_prefix.as_ref(), - self.quote_char, - self.eol_char, - self.null_values.as_ref(), - self.try_parse_dates, - self.raise_if_empty, - &mut self.n_threads, - self.decimal_comma, + &mut self.options.skip_rows, + self.options.skip_rows_after_header, + self.options.comment_prefix.as_ref(), + self.options.quote_char, + self.options.eol_char, + self.options.null_values.as_ref(), + self.options.try_parse_dates, + self.options.raise_if_empty, + &mut self.options.n_threads, + self.options.decimal_comma, )?; let schema = Arc::new(inferred_schema); Ok(to_batched_owned_mmap(self, schema)) @@ -471,20 +456,20 @@ impl<'a> CsvReader<'a, Box> { let (inferred_schema, _, _) = infer_file_schema( &reader_bytes, - self.separator.unwrap_or(b','), - self.max_records, - self.has_header, + self.options.separator, + self.options.infer_schema_length, + self.options.has_header, None, - &mut 
self.skip_rows_before_header, - self.skip_rows_after_header, - self.comment_prefix.as_ref(), - self.quote_char, - self.eol_char, - self.null_values.as_ref(), - self.try_parse_dates, - self.raise_if_empty, - &mut self.n_threads, - self.decimal_comma, + &mut self.options.skip_rows, + self.options.skip_rows_after_header, + self.options.comment_prefix.as_ref(), + self.options.quote_char, + self.options.eol_char, + self.options.null_values.as_ref(), + self.options.try_parse_dates, + self.options.raise_if_empty, + &mut self.options.n_threads, + self.options.decimal_comma, )?; let schema = Arc::new(inferred_schema); Ok(to_batched_owned_read(self, schema)) @@ -501,44 +486,26 @@ where fn new(reader: R) -> Self { CsvReader { reader, + options: CsvReaderOptions::default(), rechunk: true, n_rows: None, - max_records: Some(128), - skip_rows_before_header: 0, projection: None, - separator: None, - has_header: true, - ignore_errors: false, - schema: None, columns: None, - encoding: CsvEncoding::Utf8, - n_threads: None, path: None, - schema_overwrite: None, dtype_overwrite: None, sample_size: 1024, chunk_size: 1 << 18, - low_memory: false, - comment_prefix: None, - eol_char: b'\n', - null_values: None, missing_is_null: true, predicate: None, - quote_char: Some(b'"'), - skip_rows_after_header: 0, - try_parse_dates: false, row_index: None, - raise_if_empty: true, - truncate_ragged_lines: false, - decimal_comma: false, } } /// Read the file and create the DataFrame. 
fn finish(mut self) -> PolarsResult { let rechunk = self.rechunk; - let schema_overwrite = self.schema_overwrite.clone(); - let low_memory = self.low_memory; + let schema_overwrite = self.options.schema_overwrite.clone(); + let low_memory = self.options.low_memory; #[cfg(feature = "dtype-categorical")] let mut _cat_lock = None; @@ -557,6 +524,7 @@ where #[cfg(feature = "dtype-categorical")] { let has_cat = self + .options .schema .clone() .map(|schema| { @@ -569,7 +537,7 @@ where _cat_lock = Some(polars_core::StringCacheHolder::hold()) } } - let mut csv_reader = self.core_reader(self.schema.clone(), vec![])?; + let mut csv_reader = self.core_reader(self.options.schema.clone(), vec![])?; csv_reader.as_df()? }; @@ -585,7 +553,7 @@ where #[cfg(feature = "temporal")] // only needed until we also can parse time columns in place - if self.try_parse_dates { + if self.options.try_parse_dates { // determine the schema that's given by the user. That should not be changed let fixed_schema = match (schema_overwrite, self.dtype_overwrite) { (Some(schema), _) => schema, diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs index 63ad38bcfa12..06277d5b054e 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -5,7 +5,7 @@ use super::*; pub struct CsvExec { pub path: PathBuf, pub schema: SchemaRef, - pub options: CsvParserOptions, + pub options: CsvReaderOptions, pub file_options: FileScanOptions, pub predicate: Option>, } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index b2423c832ed8..8d49bcb9ea41 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -4,7 +4,7 @@ use std::path::PathBuf; use polars_core::export::arrow::Either; use polars_core::POOL; use polars_io::csv::read::{ - 
BatchedCsvReaderMmap, BatchedCsvReaderRead, CsvEncoding, CsvParserOptions, CsvReader, + BatchedCsvReaderMmap, BatchedCsvReaderRead, CsvEncoding, CsvReader, CsvReaderOptions, }; use polars_plan::global::_set_n_rows_for_scan; use polars_plan::prelude::FileScanOptions; @@ -22,7 +22,7 @@ pub(crate) struct CsvSource { Option, *mut BatchedCsvReaderRead<'static>>>, n_threads: usize, path: Option, - options: Option, + options: Option, file_options: Option, verbose: bool, } @@ -106,7 +106,7 @@ impl CsvSource { pub(crate) fn new( path: PathBuf, schema: SchemaRef, - options: CsvParserOptions, + options: CsvReaderOptions, file_options: FileScanOptions, verbose: bool, ) -> PolarsResult { diff --git a/crates/polars-plan/src/logical_plan/builder_dsl.rs b/crates/polars-plan/src/logical_plan/builder_dsl.rs index ec31dd2e0514..fafcdfc4286f 100644 --- a/crates/polars-plan/src/logical_plan/builder_dsl.rs +++ b/crates/polars-plan/src/logical_plan/builder_dsl.rs @@ -2,7 +2,7 @@ use polars_core::prelude::*; #[cfg(feature = "parquet")] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::{CommentPrefix, CsvEncoding, CsvParserOptions, NullValues}; +use polars_io::csv::read::{CommentPrefix, CsvEncoding, CsvReaderOptions, NullValues}; #[cfg(feature = "ipc")] use polars_io::ipc::IpcScanOptions; #[cfg(feature = "parquet")] @@ -216,7 +216,7 @@ impl DslBuilder { file_options: options, predicate: None, scan_type: FileScan::Csv { - options: CsvParserOptions { + options: CsvReaderOptions { has_header, separator, ignore_errors, diff --git a/crates/polars-plan/src/logical_plan/conversion/scans.rs b/crates/polars-plan/src/logical_plan/conversion/scans.rs index f0523d68748d..84139ff5e713 100644 --- a/crates/polars-plan/src/logical_plan/conversion/scans.rs +++ b/crates/polars-plan/src/logical_plan/conversion/scans.rs @@ -121,7 +121,7 @@ pub(super) fn ipc_file_info( pub(super) fn csv_file_info( paths: &[PathBuf], file_options: &FileScanOptions, - csv_options: &mut 
CsvParserOptions, + csv_options: &mut CsvReaderOptions, ) -> PolarsResult { use std::io::Seek; diff --git a/crates/polars-plan/src/logical_plan/file_scan.rs b/crates/polars-plan/src/logical_plan/file_scan.rs index 43c3cb7da091..2777ad8a5e1b 100644 --- a/crates/polars-plan/src/logical_plan/file_scan.rs +++ b/crates/polars-plan/src/logical_plan/file_scan.rs @@ -1,7 +1,7 @@ use std::hash::{Hash, Hasher}; #[cfg(feature = "csv")] -use polars_io::csv::read::CsvParserOptions; +use polars_io::csv::read::CsvReaderOptions; #[cfg(feature = "ipc")] use polars_io::ipc::IpcScanOptions; #[cfg(feature = "parquet")] @@ -15,7 +15,7 @@ use super::*; #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum FileScan { #[cfg(feature = "csv")] - Csv { options: CsvParserOptions }, + Csv { options: CsvReaderOptions }, #[cfg(feature = "parquet")] Parquet { options: ParquetOptions, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 10c558123a11..f6b8e5214e01 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -95,8 +95,8 @@ def read_csv( separator Single byte character to use as separator in the file. comment_prefix - A string, which can be up to 5 symbols in length, used to indicate - the start of a comment line. For instance, it can be set to `#` or `//`. + A string used to indicate the start of a comment line. Comment lines are skipped + during parsing. Common examples of comment prefixes are `#` and `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes. @@ -654,8 +654,8 @@ def read_csv_batched( separator Single byte character to use as separator in the file. comment_prefix - A string, which can be up to 5 symbols in length, used to indicate - the start of a comment line. For instance, it can be set to `#` or `//`. + A string used to indicate the start of a comment line. 
Comment lines are skipped + during parsing. Common examples of comment prefixes are `#` and `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes. @@ -944,8 +944,8 @@ def scan_csv( separator Single byte character to use as separator in the file. comment_prefix - A string, which can be up to 5 symbols in length, used to indicate - the start of a comment line. For instance, it can be set to `#` or `//`. + A string used to indicate the start of a comment line. Comment lines are skipped + during parsing. Common examples of comment prefixes are `#` and `//`. quote_char Single byte character used for csv quoting, default = `"`. Set to None to turn off special handling and escaping of quotes.