From 4be0f71e7519e9407e65ecaf368a670d8fca5e7c Mon Sep 17 00:00:00 2001
From: alexander-beedie <alexander.m.beedie@icloud.com>
Date: Tue, 25 Jun 2024 00:56:55 +0400
Subject: [PATCH] feat: Support `SELECT * ILIKE` wildcard syntax

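Add support for the `SELECT * ILIKE '<pattern>'` wildcard option in the
SQL interface: only columns whose names match the pattern
case-insensitively are retained (`%` matches any run of characters,
`_` matches any single character).

The wildcard modifier state (EXCLUDE, ILIKE, RENAME, REPLACE) is now
carried in a single `SelectModifiers` struct. EXCLUDE and ILIKE cannot
be used together on the same wildcard.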
---
 crates/polars-sql/src/context.rs              | 155 ++++++++++--------
 py-polars/tests/unit/sql/test_structs.py      |  18 +-
 .../tests/unit/sql/test_wildcard_opts.py      |  99 ++++++++---
 3 files changed, 182 insertions(+), 90 deletions(-)
diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs
index ea86de06f4e5a..b20db31f47439 100644
--- a/crates/polars-sql/src/context.rs
+++ b/crates/polars-sql/src/context.rs
@@ -1,6 +1,7 @@
 use std::cell::RefCell;
 use std::ops::Deref;
 
+use polars_core::export::regex::{escape, Regex};
 use polars_core::frame::row::Row;
 use polars_core::prelude::*;
 use polars_lazy::prelude::*;
@@ -23,6 +24,18 @@ use crate::sql_expr::{
 };
 use crate::table_functions::PolarsTableFunctions;
 
+struct RenameColumns {
+    before: Vec<String>, // existing column names (index-aligned with `after`)
+    after: Vec<String>,  // replacement names passed to `rename`
+}
+
+struct SelectModifiers {
+    exclude: PlHashSet<String>, // SELECT * EXCLUDE: column names to omit
+    ilike: Option<Regex>,       // SELECT * ILIKE: case-insensitive column name filter
+    rename: RenameColumns,      // SELECT * RENAME: before/after column name pairs
+    replace: Vec<Expr>,         // SELECT * REPLACE: aliased replacement expressions
+}
+
 /// The SQLContext is the main entry point for executing SQL queries.
 #[derive(Clone)]
 pub struct SQLContext {
@@ -607,9 +620,15 @@ impl SQLContext {
         lf = self.process_where(lf, &select_stmt.selection)?;
 
         // 'SELECT *' modifiers
-        let mut excluded_cols = vec![];
-        let mut replace_exprs = vec![];
-        let mut rename_cols = (&mut vec![], &mut vec![]);
+        let mut select_modifiers = SelectModifiers {
+            ilike: None,
+            exclude: PlHashSet::new(),
+            rename: RenameColumns {
+                before: vec![],
+                after: vec![],
+            },
+            replace: vec![],
+        };
 
         // Column projections (SELECT clause)
         let projections: Vec<Expr> = select_stmt
@@ -628,9 +647,7 @@ impl SQLContext {
                         .process_qualified_wildcard(
                             obj_name,
                             wildcard_options,
-                            &mut excluded_cols,
-                            &mut rename_cols,
-                            &mut replace_exprs,
+                            &mut select_modifiers,
                             Some(schema.deref()),
                         )?,
                     SelectItem::Wildcard(wildcard_options) => {
@@ -642,9 +659,7 @@ impl SQLContext {
                         self.process_wildcard_additional_options(
                             cols,
                             wildcard_options,
-                            &mut excluded_cols,
-                            &mut rename_cols,
-                            &mut replace_exprs,
+                            &mut select_modifiers,
                             Some(schema.deref()),
                         )?
                     },
@@ -703,34 +718,36 @@ impl SQLContext {
         };
 
         lf = if group_by_keys.is_empty() {
-            lf = if query.order_by.is_empty() {
-                // No sort, select cols as given
-                lf.select(projections)
-            } else {
-                // Add projections to the base frame as any of the
-                // original columns may be required for the sort
-                lf = lf.with_columns(projections.clone());
-
-                // Final/selected cols (also ensures accurate ordinal position refs)
-                let retained_cols = projections
-                    .iter()
-                    .map(|e| {
-                        col(e
-                            .to_field(schema.deref(), Context::Default)
-                            .unwrap()
-                            .name
-                            .as_str())
-                    })
-                    .collect::<Vec<_>>();
+            // Establish final/selected cols, accounting for 'SELECT *' modifiers
+            let retained_cols: Vec<_> = projections
+                .iter()
+                .filter_map(|e| {
+                    let field = e.to_field(schema.deref(), Context::Default).ok()?;
+                    let name = field.name.to_string();
+                    if select_modifiers
+                        .ilike
+                        .as_ref()
+                        .map_or(true, |rx| rx.is_match(&name))
+                        && !select_modifiers.exclude.contains(&name)
+                    {
+                        Some(col(name.as_str()))
+                    } else {
+                        None
+                    }
+                })
+                .collect();
 
-                lf = self.process_order_by(lf, &query.order_by, Some(&retained_cols))?;
-                lf.select(retained_cols)
-            };
-            // Discard any excluded cols
-            if !excluded_cols.is_empty() {
-                lf.drop(excluded_cols)
+            if query.order_by.is_empty() {
+                if select_modifiers.ilike.is_some() || !select_modifiers.exclude.is_empty() {
+                    lf.select(projections).select(retained_cols)
+                } else {
+                    lf.select(projections)
+                }
             } else {
-                lf
+                // Include projections as any of the original columns may be required for the sort
+                lf = lf.with_columns(projections.clone());
+                self.process_order_by(lf, &query.order_by, Some(&retained_cols))?
+                    .select(retained_cols)
             }
         } else {
             lf = self.process_group_by(lf, &group_by_keys, &projections)?;
@@ -771,12 +788,15 @@ impl SQLContext {
             None => lf,
         };
 
-        // Apply final 'SELECT *' modifiers
-        if !replace_exprs.is_empty() {
-            lf = lf.with_columns(replace_exprs);
+        // Apply final 'SELECT *' REPLACE/RENAME modifiers
+        if !select_modifiers.replace.is_empty() {
+            lf = lf.with_columns(select_modifiers.replace);
         }
-        if !rename_cols.0.is_empty() {
-            lf = lf.rename(rename_cols.0, rename_cols.1);
+        if !select_modifiers.rename.before.is_empty() {
+            lf = lf.rename(
+                select_modifiers.rename.before,
+                select_modifiers.rename.after,
+            );
         }
         Ok(lf)
     }
@@ -1173,70 +1193,73 @@ impl SQLContext {
         &mut self,
         ObjectName(idents): &ObjectName,
         options: &WildcardAdditionalOptions,
-        excluded_cols: &mut Vec<String>,
-        rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
-        replace_exprs: &mut Vec<Expr>,
+        modifiers: &mut SelectModifiers,
         schema: Option<&Schema>,
     ) -> PolarsResult<Vec<Expr>> {
         let mut new_idents = idents.clone();
         new_idents.push(Ident::new("*"));
 
         let expr = resolve_compound_identifier(self, new_idents.deref(), schema);
-        self.process_wildcard_additional_options(
-            expr?,
-            options,
-            excluded_cols,
-            rename_cols,
-            replace_exprs,
-            schema,
-        )
+        self.process_wildcard_additional_options(expr?, options, modifiers, schema)
     }
 
     fn process_wildcard_additional_options(
         &mut self,
         exprs: Vec<Expr>,
         options: &WildcardAdditionalOptions,
-        excluded_cols: &mut Vec<String>,
-        rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
-        replace_exprs: &mut Vec<Expr>,
+        modifiers: &mut SelectModifiers,
         schema: Option<&Schema>,
     ) -> PolarsResult<Vec<Expr>> {
         // bail on (currently) unsupported wildcard options
         if options.opt_except.is_some() {
-            polars_bail!(SQLInterface: "EXCEPT wildcard option is unsupported (use EXCLUDE instead)")
-        } else if options.opt_ilike.is_some() {
-            polars_bail!(SQLInterface: "ILIKE wildcard option is currently unsupported")
+            polars_bail!(SQLInterface: "EXCEPT wildcard option is not supported (use EXCLUDE instead)")
+        } else if options.opt_exclude.is_some() && options.opt_ilike.is_some() {
+            polars_bail!(SQLInterface: "EXCLUDE and ILIKE wildcard options cannot be used together")
         } else if options.opt_rename.is_some() && options.opt_replace.is_some() {
             // pending an upstream fix: https://github.com/sqlparser-rs/sqlparser-rs/pull/1321
-            polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used simultaneously")
+            polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used together")
         }
 
         if let Some(items) = &options.opt_exclude {
-            *excluded_cols = match items {
-                ExcludeSelectItem::Single(ident) => vec![ident.value.clone()],
+            match items {
+                ExcludeSelectItem::Single(ident) => {
+                    modifiers.exclude.insert(ident.value.clone());
+                },
                 ExcludeSelectItem::Multiple(idents) => {
-                    idents.iter().map(|i| i.value.clone()).collect()
+                    modifiers
+                        .exclude
+                        .extend(idents.iter().map(|i| i.value.clone()));
                 },
             };
+        } else if let Some(item) = &options.opt_ilike {
+            let rx = escape(item.pattern.as_str())
+                .replace('%', ".*")
+                .replace('_', ".");
+
+            modifiers.ilike = Some(Regex::new(format!("^(?i){}$", rx).as_str()).unwrap());
         }
+
         if let Some(items) = &options.opt_rename {
             match items {
                 RenameSelectItem::Single(rename) => {
-                    rename_cols.0.push(rename.ident.value.clone());
-                    rename_cols.1.push(rename.alias.value.clone());
+                    modifiers.rename.before.push(rename.ident.value.clone());
+                    modifiers.rename.after.push(rename.alias.value.clone());
                 },
                 RenameSelectItem::Multiple(renames) => {
                     for rn in renames {
-                        rename_cols.0.push(rn.ident.value.clone());
-                        rename_cols.1.push(rn.alias.value.clone());
+                        modifiers.rename.before.push(rn.ident.value.clone());
+                        modifiers.rename.after.push(rn.alias.value.clone());
                     }
                 },
             }
         }
+
         if let Some(replacements) = &options.opt_replace {
             for rp in &replacements.items {
                 let replacement_expr = parse_sql_expr(&rp.expr, self, schema);
-                replace_exprs.push(replacement_expr?.alias(rp.column_name.value.as_str()));
+                modifiers
+                    .replace
+                    .push(replacement_expr?.alias(rp.column_name.value.as_str()));
             }
         }
         Ok(exprs)
diff --git a/py-polars/tests/unit/sql/test_structs.py b/py-polars/tests/unit/sql/test_structs.py
index db965efcd86d6..73adeb4c06acb 100644
--- a/py-polars/tests/unit/sql/test_structs.py
+++ b/py-polars/tests/unit/sql/test_structs.py
@@ -19,9 +19,17 @@ def df_struct() -> pl.DataFrame:
     ).select(pl.struct(pl.all()).alias("json_msg"))
 
 
-def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
+@pytest.mark.parametrize(
+    "order_by",
+    [
+        "ORDER BY json_msg.id DESC",
+        "ORDER BY 2 DESC",
+        "",
+    ],
+)
+def test_struct_field_selection(order_by: str, df_struct: pl.DataFrame) -> None:
     res = df_struct.sql(
-        """
+        f"""
         SELECT
           -- validate table alias resolution
           frame.json_msg.id AS ID,
@@ -32,10 +40,12 @@ def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
         WHERE
           json_msg.age > 20 AND
           json_msg.other.n IS NOT NULL  -- note: nested struct field
-        ORDER BY
-          json_msg.name DESC
+        {order_by}
         """
     )
+    if not order_by:
+        res = res.sort(by="ID", descending=True)
+
     expected = pl.DataFrame({"ID": [400, 200], "NAME": ["Zoe", "Bob"], "AGE": [45, 45]})
     assert_frame_equal(expected, res)
 
diff --git a/py-polars/tests/unit/sql/test_wildcard_opts.py b/py-polars/tests/unit/sql/test_wildcard_opts.py
index ad17a215f7dac..c3bd75a7ef662 100644
--- a/py-polars/tests/unit/sql/test_wildcard_opts.py
+++ b/py-polars/tests/unit/sql/test_wildcard_opts.py
@@ -5,21 +5,30 @@
 import pytest
 
 import polars as pl
-from polars.exceptions import DuplicateError
+from polars.exceptions import DuplicateError, SQLInterfaceError
+from polars.testing import assert_frame_equal
 
 
 @pytest.fixture()
 def df() -> pl.DataFrame:
-    return pl.DataFrame({"num": [999, 666], "str": ["b", "a"], "val": [2.0, 0.5]})
+    return pl.DataFrame(
+        {
+            "ID": [333, 999],
+            "FirstName": ["Bruce", "Clark"],
+            "LastName": ["Wayne", "Kent"],
+            "Address": ["The Batcave", "Fortress of Solitude"],
+            "City": ["Gotham", "Metropolis"],
+        }
+    )
 
 
 @pytest.mark.parametrize(
     ("excluded", "expected"),
     [
-        ("num", ["str", "val"]),
-        ("(val, num)", ["str"]),
-        ("(str, num)", ["val"]),
-        ("(str, val, num)", []),
+        ("ID", ["FirstName", "LastName", "Address", "City"]),
+        ("(ID)", ["FirstName", "LastName", "Address", "City"]),
+        ("(Address, LastName, FirstName)", ["ID", "City"]),
+        ('("ID", "FirstName", "LastName", "Address", "City")', []),
     ],
 )
 def test_select_exclude(
@@ -30,18 +39,65 @@ def test_select_exclude(
     assert df.sql(f"SELECT * EXCLUDE {excluded} FROM self").columns == expected
 
 
+def test_select_exclude_order_by(
+    df: pl.DataFrame,
+) -> None:
+    expected = pl.DataFrame(
+        {
+            "FirstName": ["Clark", "Bruce"],
+            "Address": ["Fortress of Solitude", "The Batcave"],
+        }
+    )
+    for order_by in ("ORDER BY 2", "ORDER BY 1 DESC"):
+        actual = df.sql(f"SELECT * EXCLUDE (ID,LastName,City) FROM self {order_by}")
+        assert_frame_equal(actual, expected)
+
+
 def test_select_exclude_error(df: pl.DataFrame) -> None:
-    with pytest.raises(DuplicateError, match="the name 'num' is duplicate"):
-        # note: missing "()" around the exclude option results in dupe col
-        assert df.sql("SELECT * EXCLUDE val, num FROM self")
+    # EXCLUDE and ILIKE are not allowed together
+    with pytest.raises(SQLInterfaceError, match="ILIKE"):
+        assert df.sql("SELECT * EXCLUDE Address ILIKE '%o%' FROM self")
+
+    # note: missing "()" around the exclude option results in dupe col
+    with pytest.raises(DuplicateError, match="the name 'City' is duplicate"):
+        assert df.sql("SELECT * EXCLUDE Address, City FROM self")
+
+
+def test_ilike(df: pl.DataFrame) -> None:
+    assert df.sql("SELECT * ILIKE 'a%e' FROM self").columns == []
+    assert df.sql("SELECT * ILIKE '%nam_' FROM self").columns == [
+        "FirstName",
+        "LastName",
+    ]
+    assert df.sql("SELECT * ILIKE '%a%e%' FROM self").columns == [
+        "FirstName",
+        "LastName",
+        "Address",
+    ]
+    assert df.sql(
+        """SELECT * ILIKE '%I%' RENAME (FirstName AS Name) FROM self"""
+    ).columns == [
+        "ID",
+        "Name",
+        "City",
+    ]
 
 
 @pytest.mark.parametrize(
     ("renames", "expected"),
     [
-        ("val AS value", ["num", "str", "value"]),
-        ("(num AS flt)", ["flt", "str", "val"]),
-        ("(val AS value, num AS flt)", ["flt", "str", "value"]),
+        (
+            "Address AS Location",
+            ["ID", "FirstName", "LastName", "Location", "City"],
+        ),
+        (
+            '(Address AS "Location")',
+            ["ID", "FirstName", "LastName", "Location", "City"],
+        ),
+        (
+            '("Address" AS Location, "ID" AS PersonID)',
+            ["PersonID", "FirstName", "LastName", "Location", "City"],
+        ),
     ],
 )
 def test_select_rename(
@@ -53,27 +109,30 @@ def test_select_rename(
 
 
 @pytest.mark.parametrize(
-    ("replacements", "check_cols", "expected"),
+    ("replacements", "order_by", "check_cols", "expected"),
     [
         (
-            "(num // 3 AS num)",
-            ["num"],
-            [(333,), (222,)],
+            "(ID // 3 AS ID)",
+            "",
+            ["ID"],
+            [(111,), (333,)],
         ),
         (
-            "((str || str) AS str, num / 3 AS num)",
-            ["num", "str"],
-            [(333, "bb"), (222, "aa")],
+            "((City || ':' || City) AS City, ID // 3 AS ID)",
+            "ORDER BY ID DESC",
+            ["City", "ID"],
+            [("Metropolis:Metropolis", 333), ("Gotham:Gotham", 111)],
         ),
     ],
 )
 def test_select_replace(
     replacements: str,
+    order_by: str,
     check_cols: list[str],
     expected: list[tuple[Any]],
     df: pl.DataFrame,
 ) -> None:
-    res = df.sql(f"SELECT * REPLACE {replacements} FROM self")
+    res = df.sql(f"SELECT * REPLACE {replacements} FROM self {order_by}")
 
     assert res.select(check_cols).rows() == expected
     assert res.columns == df.columns