From 27a609cf746d9374e6877cf37c23042da69cf8ac Mon Sep 17 00:00:00 2001
From: ritchie <ritchie46@gmail.com>
Date: Sat, 27 Apr 2024 11:26:26 +0200
Subject: [PATCH] feat: Add option to disable globbing in csv

---
 crates/polars-lazy/src/scan/csv.rs            | 29 ++++++++++++++-----
 .../polars-lazy/src/scan/file_list_reader.rs  |  7 +++++
 py-polars/polars/io/csv/functions.py          | 12 ++++++++
 py-polars/src/lazyframe/mod.rs                |  4 ++-
 py-polars/tests/unit/io/test_csv.py           | 12 ++++++++
 5 files changed, 55 insertions(+), 9 deletions(-)
diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs
index a81099e9cb51..9999d5219ce5 100644
--- a/crates/polars-lazy/src/scan/csv.rs
+++ b/crates/polars-lazy/src/scan/csv.rs
@@ -13,29 +13,30 @@ pub struct LazyCsvReader {
     path: PathBuf,
     paths: Arc<[PathBuf]>,
     separator: u8,
-    has_header: bool,
-    ignore_errors: bool,
     skip_rows: usize,
     n_rows: Option<usize>,
-    cache: bool,
     schema: Option<SchemaRef>,
     schema_overwrite: Option<SchemaRef>,
-    low_memory: bool,
     comment_prefix: Option<CommentPrefix>,
     quote_char: Option<u8>,
     eol_char: u8,
     null_values: Option<NullValues>,
-    missing_is_null: bool,
-    truncate_ragged_lines: bool,
     infer_schema_length: Option<usize>,
     rechunk: bool,
     skip_rows_after_header: usize,
     encoding: CsvEncoding,
     row_index: Option<RowIndex>,
-    try_parse_dates: bool,
-    raise_if_empty: bool,
     n_threads: Option<usize>,
+    cache: bool,
+    has_header: bool,
+    ignore_errors: bool,
+    low_memory: bool,
+    missing_is_null: bool,
+    truncate_ragged_lines: bool,
     decimal_comma: bool,
+    try_parse_dates: bool,
+    raise_if_empty: bool,
+    glob: bool,
 }
 
 #[cfg(feature = "csv")]
@@ -72,6 +73,7 @@ impl LazyCsvReader {
             truncate_ragged_lines: false,
             n_threads: None,
             decimal_comma: false,
+            glob: true,
         }
     }
 
@@ -238,6 +240,13 @@ impl LazyCsvReader {
         self
     }
 
+    #[must_use]
+    /// Enable or disable expansion of the path via globbing rules (`true` expands glob patterns).
+    pub fn with_glob(mut self, toggle: bool) -> Self {
+        self.glob = toggle;
+        self
+    }
+
     /// Modify a schema before we run the lazy scanning.
     ///
     /// Important! Run this function latest in the builder!
@@ -322,6 +331,10 @@ impl LazyFileListReader for LazyCsvReader {
         Ok(lf)
     }
 
+    fn glob(&self) -> bool {
+        self.glob // flag set through `with_glob`
+    }
+
     fn path(&self) -> &Path {
         &self.path
     }
diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs
index ceb36334698a..bc19aea8a7d5 100644
--- a/crates/polars-lazy/src/scan/file_list_reader.rs
+++ b/crates/polars-lazy/src/scan/file_list_reader.rs
@@ -36,6 +36,9 @@ fn polars_glob(pattern: &str, cloud_options: Option<&CloudOptions>) -> PolarsRes
 pub trait LazyFileListReader: Clone {
     /// Get the final [LazyFrame].
     fn finish(self) -> PolarsResult<LazyFrame> {
+        if !self.glob() {
+            return self.finish_no_glob();
+        }
         if let Some(paths) = self.iter_paths()? {
             let lfs = paths
                 .map(|r| {
@@ -89,6 +92,10 @@ pub trait LazyFileListReader: Clone {
     /// It is recommended to always use [LazyFileListReader::finish] method.
     fn finish_no_glob(self) -> PolarsResult<LazyFrame>;
 
+    fn glob(&self) -> bool {
+        true // default; `finish` calls `finish_no_glob` directly when this returns false
+    }
+
     /// Path of the scanned file.
     /// It can be potentially a glob pattern.
     fn path(&self) -> &Path;
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index f6b8e5214e01..817fbe349de3 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -70,6 +70,7 @@ def read_csv(
     raise_if_empty: bool = True,
     truncate_ragged_lines: bool = False,
     decimal_comma: bool = False,
+    glob: bool = True,
 ) -> DataFrame:
     r"""
     Read a CSV file into a DataFrame.
@@ -188,6 +189,8 @@ def read_csv(
         Truncate lines that are longer than the schema.
     decimal_comma
         Parse floats with decimal signs
+    glob
+        Expand path given via globbing rules.
 
     Returns
     -------
@@ -442,6 +445,7 @@ def read_csv(
             raise_if_empty=raise_if_empty,
             truncate_ragged_lines=truncate_ragged_lines,
             decimal_comma=decimal_comma,
+            glob=glob,
         )
 
     if new_columns:
@@ -479,6 +483,7 @@ def _read_csv_impl(
     raise_if_empty: bool = True,
     truncate_ragged_lines: bool = False,
     decimal_comma: bool = False,
+    glob: bool = True,
 ) -> DataFrame:
     path: str | None
     if isinstance(source, (str, Path)):
@@ -542,6 +547,7 @@ def _read_csv_impl(
             raise_if_empty=raise_if_empty,
             truncate_ragged_lines=truncate_ragged_lines,
             decimal_comma=decimal_comma,
+            glob=glob,
         )
         if columns is None:
             return scan.collect()
@@ -925,6 +931,7 @@ def scan_csv(
     raise_if_empty: bool = True,
     truncate_ragged_lines: bool = False,
     decimal_comma: bool = False,
+    glob: bool = True,
 ) -> LazyFrame:
     r"""
     Lazily read from a CSV file or multiple files via glob patterns.
@@ -1019,6 +1026,8 @@ def scan_csv(
         Truncate lines that are longer than the schema.
     decimal_comma
         Parse floats with decimal signs
+    glob
+        Expand path given via globbing rules.
 
     Returns
     -------
@@ -1138,6 +1147,7 @@ def with_column_names(cols: list[str]) -> list[str]:
         raise_if_empty=raise_if_empty,
         truncate_ragged_lines=truncate_ragged_lines,
         decimal_comma=decimal_comma,
+        glob=glob,
     )
 
 
@@ -1169,6 +1179,7 @@ def _scan_csv_impl(
     raise_if_empty: bool = True,
     truncate_ragged_lines: bool = True,
     decimal_comma: bool = False,
+    glob: bool = True,
 ) -> LazyFrame:
     dtype_list: list[tuple[str, PolarsDataType]] | None = None
     if dtypes is not None:
@@ -1210,5 +1221,6 @@ def _scan_csv_impl(
         truncate_ragged_lines=truncate_ragged_lines,
         decimal_comma=decimal_comma,
         schema=schema,
+        glob=glob,
     )
     return wrap_ldf(pylf)
diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs
index 63585822ddac..253210cb18d9 100644
--- a/py-polars/src/lazyframe/mod.rs
+++ b/py-polars/src/lazyframe/mod.rs
@@ -141,7 +141,7 @@ impl PyLazyFrame {
     #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype,
         low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string,
         infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header,
-        encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema
+        encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema
     )
     )]
     fn new_from_csv(
@@ -170,6 +170,7 @@ impl PyLazyFrame {
         raise_if_empty: bool,
         truncate_ragged_lines: bool,
         decimal_comma: bool,
+        glob: bool,
         schema: Option<Wrap<Schema>>,
     ) -> PyResult<Self> {
         let null_values = null_values.map(|w| w.0);
@@ -214,6 +215,7 @@ impl PyLazyFrame {
             .with_missing_is_null(!missing_utf8_is_empty_string)
             .truncate_ragged_lines(truncate_ragged_lines)
             .with_decimal_comma(decimal_comma)
+            .with_glob(glob)
             .raise_if_empty(raise_if_empty);
 
         if let Some(lambda) = with_schema_modify {
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index 80d2058e61fe..577ca3bbe9e3 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -2081,3 +2081,15 @@ def test_fsspec_not_available(monkeypatch: pytest.MonkeyPatch) -> None:
         pl.read_csv(
             "s3://foods/cabbage.csv", storage_options={"key": "key", "secret": "secret"}
         )
+
+
+@pytest.mark.write_disk()
+@pytest.mark.skipif(os.environ.get("POLARS_FORCE_ASYNC") == "1", reason="only local")
+def test_no_glob(tmpdir: Path) -> None:
+    df = pl.DataFrame({"foo": 1})
+    p = tmpdir / "*.csv"  # file whose literal name is "*.csv"
+    df.write_csv(str(p))
+    p = tmpdir / "*1.csv"  # second file that the glob pattern "*.csv" would also match
+    df.write_csv(str(p))
+    p = tmpdir / "*.csv"
+    assert_frame_equal(pl.read_csv(str(p), glob=False), df)  # must equal one file's data