From 27a609cf746d9374e6877cf37c23042da69cf8ac Mon Sep 17 00:00:00 2001 From: ritchie Date: Sat, 27 Apr 2024 11:26:26 +0200 Subject: [PATCH] feat: Add option to disable globbing in csv --- crates/polars-lazy/src/scan/csv.rs | 29 ++++++++++++++----- .../polars-lazy/src/scan/file_list_reader.rs | 7 +++++ py-polars/polars/io/csv/functions.py | 12 ++++++++ py-polars/src/lazyframe/mod.rs | 4 ++- py-polars/tests/unit/io/test_csv.py | 12 ++++++++ 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index a81099e9cb51..9999d5219ce5 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -13,29 +13,30 @@ pub struct LazyCsvReader { path: PathBuf, paths: Arc<[PathBuf]>, separator: u8, - has_header: bool, - ignore_errors: bool, skip_rows: usize, n_rows: Option, - cache: bool, schema: Option, schema_overwrite: Option, - low_memory: bool, comment_prefix: Option, quote_char: Option, eol_char: u8, null_values: Option, - missing_is_null: bool, - truncate_ragged_lines: bool, infer_schema_length: Option, rechunk: bool, skip_rows_after_header: usize, encoding: CsvEncoding, row_index: Option, - try_parse_dates: bool, - raise_if_empty: bool, n_threads: Option, + cache: bool, + has_header: bool, + ignore_errors: bool, + low_memory: bool, + missing_is_null: bool, + truncate_ragged_lines: bool, decimal_comma: bool, + try_parse_dates: bool, + raise_if_empty: bool, + glob: bool, } #[cfg(feature = "csv")] @@ -72,6 +73,7 @@ impl LazyCsvReader { truncate_ragged_lines: false, n_threads: None, decimal_comma: false, + glob: true, } } @@ -238,6 +240,13 @@ impl LazyCsvReader { self } + #[must_use] + /// Expand path given via globbing rules. + pub fn with_glob(mut self, toggle: bool) -> Self { + self.glob = toggle; + self + } + /// Modify a schema before we run the lazy scanning. /// /// Important! Run this function latest in the builder! @@ -322,6 +331,10 @@ impl LazyFileListReader for LazyCsvReader { Ok(lf) } + fn glob(&self) -> bool { + self.glob + } + fn path(&self) -> &Path { &self.path } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index ceb36334698a..bc19aea8a7d5 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -36,6 +36,9 @@ fn polars_glob(pattern: &str, cloud_options: Option<&CloudOptions>) -> PolarsRes pub trait LazyFileListReader: Clone { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { + if !self.glob() { + return self.finish_no_glob(); + } if let Some(paths) = self.iter_paths()? { let lfs = paths .map(|r| { @@ -89,6 +92,10 @@ pub trait LazyFileListReader: Clone { /// It is recommended to always use [LazyFileListReader::finish] method. fn finish_no_glob(self) -> PolarsResult; + fn glob(&self) -> bool { + true + } + /// Path of the scanned file. /// It can be potentially a glob pattern. fn path(&self) -> &Path; diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index f6b8e5214e01..817fbe349de3 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -70,6 +70,7 @@ def read_csv( raise_if_empty: bool = True, truncate_ragged_lines: bool = False, decimal_comma: bool = False, + glob: bool = True, ) -> DataFrame: r""" Read a CSV file into a DataFrame. @@ -188,6 +189,8 @@ def read_csv( Truncate lines that are longer than the schema. decimal_comma Parse floats with decimal signs + glob + Expand path given via globbing rules. Returns ------- @@ -442,6 +445,7 @@ def read_csv( raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, + glob=glob, ) if new_columns: @@ -479,6 +483,7 @@ def _read_csv_impl( raise_if_empty: bool = True, truncate_ragged_lines: bool = False, decimal_comma: bool = False, + glob: bool = True, ) -> DataFrame: path: str | None if isinstance(source, (str, Path)): @@ -542,6 +547,7 @@ def _read_csv_impl( raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, + glob=glob, ) if columns is None: return scan.collect() @@ -925,6 +931,7 @@ def scan_csv( raise_if_empty: bool = True, truncate_ragged_lines: bool = False, decimal_comma: bool = False, + glob: bool = True, ) -> LazyFrame: r""" Lazily read from a CSV file or multiple files via glob patterns. @@ -1019,6 +1026,8 @@ def scan_csv( Truncate lines that are longer than the schema. decimal_comma Parse floats with decimal signs + glob + Expand path given via globbing rules. Returns ------- @@ -1138,6 +1147,7 @@ def with_column_names(cols: list[str]) -> list[str]: raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, + glob=glob, ) @@ -1169,6 +1179,7 @@ def _scan_csv_impl( raise_if_empty: bool = True, truncate_ragged_lines: bool = True, decimal_comma: bool = False, + glob: bool = True, ) -> LazyFrame: dtype_list: list[tuple[str, PolarsDataType]] | None = None if dtypes is not None: @@ -1210,5 +1221,6 @@ def _scan_csv_impl( truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, schema=schema, + glob=glob, ) return wrap_ldf(pylf) diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index 63585822ddac..253210cb18d9 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -141,7 +141,7 @@ impl PyLazyFrame { #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, - encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema + encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema ) )] fn new_from_csv( @@ -170,6 +170,7 @@ impl PyLazyFrame { raise_if_empty: bool, truncate_ragged_lines: bool, decimal_comma: bool, + glob: bool, schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); @@ -214,6 +215,7 @@ impl PyLazyFrame { .with_missing_is_null(!missing_utf8_is_empty_string) .truncate_ragged_lines(truncate_ragged_lines) .with_decimal_comma(decimal_comma) + .with_glob(glob) .raise_if_empty(raise_if_empty); if let Some(lambda) = with_schema_modify { diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 80d2058e61fe..577ca3bbe9e3 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2081,3 +2081,15 @@ def test_fsspec_not_available(monkeypatch: pytest.MonkeyPatch) -> None: pl.read_csv( "s3://foods/cabbage.csv", storage_options={"key": "key", "secret": "secret"} ) + + +@pytest.mark.write_disk() +@pytest.mark.skipif(os.environ.get("POLARS_FORCE_ASYNC") == "1", reason="only local") +def test_no_glob(tmpdir: Path) -> None: + df = pl.DataFrame({"foo": 1}) + p = tmpdir / "*.csv" + df.write_csv(str(p)) + p = tmpdir / "*1.csv" + df.write_csv(str(p)) + p = tmpdir / "*.csv" + assert_frame_equal(pl.read_csv(str(p), glob=False), df)