feat: Support SELECT * ILIKE wildcard syntax

feat: Support `SELECT * ILIKE` wildcard syntax in SQL interface
pola-rs · Jun 24, 2024 · 4be0f71
1 parent a69f6dd
commit 4be0f71
Show file tree

Hide file tree

Showing 3 changed files with 182 additions and 90 deletions.
diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs
@@ -1,6 +1,7 @@
 use std::cell::RefCell;
 use std::ops::Deref;
 
+use polars_core::export::regex::{escape, Regex};
 use polars_core::frame::row::Row;
 use polars_core::prelude::*;
 use polars_lazy::prelude::*;
@@ -23,6 +24,18 @@ use crate::sql_expr::{
 };
 use crate::table_functions::PolarsTableFunctions;
 
+struct RenameColumns {
+    before: Vec<String>,
+    after: Vec<String>,
+}
+
+struct SelectModifiers {
+    exclude: PlHashSet<String>, // SELECT * EXCLUDE
+    ilike: Option<Regex>,       // SELECT * ILIKE
+    rename: RenameColumns,      // SELECT * RENAME
+    replace: Vec<Expr>,         // SELECT * REPLACE
+}
+
 /// The SQLContext is the main entry point for executing SQL queries.
 #[derive(Clone)]
 pub struct SQLContext {
@@ -607,9 +620,15 @@ impl SQLContext {
         lf = self.process_where(lf, &select_stmt.selection)?;
 
         // 'SELECT *' modifiers
-        let mut excluded_cols = vec![];
-        let mut replace_exprs = vec![];
-        let mut rename_cols = (&mut vec![], &mut vec![]);
+        let mut select_modifiers = SelectModifiers {
+            ilike: None,
+            exclude: PlHashSet::new(),
+            rename: RenameColumns {
+                before: vec![],
+                after: vec![],
+            },
+            replace: vec![],
+        };
 
         // Column projections (SELECT clause)
         let projections: Vec<Expr> = select_stmt
@@ -628,9 +647,7 @@ impl SQLContext {
                         .process_qualified_wildcard(
                             obj_name,
                             wildcard_options,
-                            &mut excluded_cols,
-                            &mut rename_cols,
-                            &mut replace_exprs,
+                            &mut select_modifiers,
                             Some(schema.deref()),
                         )?,
                     SelectItem::Wildcard(wildcard_options) => {
@@ -642,9 +659,7 @@ impl SQLContext {
                         self.process_wildcard_additional_options(
                             cols,
                             wildcard_options,
-                            &mut excluded_cols,
-                            &mut rename_cols,
-                            &mut replace_exprs,
+                            &mut select_modifiers,
                             Some(schema.deref()),
                         )?
                     },
@@ -703,34 +718,36 @@ impl SQLContext {
         };
 
         lf = if group_by_keys.is_empty() {
-            lf = if query.order_by.is_empty() {
-                // No sort, select cols as given
-                lf.select(projections)
-            } else {
-                // Add projections to the base frame as any of the
-                // original columns may be required for the sort
-                lf = lf.with_columns(projections.clone());
-
-                // Final/selected cols (also ensures accurate ordinal position refs)
-                let retained_cols = projections
-                    .iter()
-                    .map(|e| {
-                        col(e
-                            .to_field(schema.deref(), Context::Default)
-                            .unwrap()
-                            .name
-                            .as_str())
-                    })
-                    .collect::<Vec<_>>();
+            // Establish final/selected cols, accounting for 'SELECT *' modifiers
+            let retained_cols: Vec<_> = projections
+                .iter()
+                .filter_map(|e| {
+                    let field = e.to_field(schema.deref(), Context::Default).ok()?;
+                    let name = field.name.to_string();
+                    if select_modifiers
+                        .ilike
+                        .as_ref()
+                        .map_or(true, |rx| rx.is_match(&name))
+                        && !select_modifiers.exclude.contains(&name)
+                    {
+                        Some(col(name.as_str()))
+                    } else {
+                        None
+                    }
+                })
+                .collect();
 
-                lf = self.process_order_by(lf, &query.order_by, Some(&retained_cols))?;
-                lf.select(retained_cols)
-            };
-            // Discard any excluded cols
-            if !excluded_cols.is_empty() {
-                lf.drop(excluded_cols)
+            if query.order_by.is_empty() {
+                if select_modifiers.ilike.is_some() || !select_modifiers.exclude.is_empty() {
+                    lf.select(projections).select(retained_cols)
+                } else {
+                    lf.select(projections)
+                }
             } else {
-                lf
+                // Include projections as any of the original columns may be required for the sort
+                lf = lf.with_columns(projections.clone());
+                self.process_order_by(lf, &query.order_by, Some(&retained_cols))?
+                    .select(retained_cols)
             }
         } else {
             lf = self.process_group_by(lf, &group_by_keys, &projections)?;
@@ -771,12 +788,15 @@ impl SQLContext {
             None => lf,
         };
 
-        // Apply final 'SELECT *' modifiers
-        if !replace_exprs.is_empty() {
-            lf = lf.with_columns(replace_exprs);
+        // Apply final 'SELECT *' REPLACE/RENAME modifiers
+        if !select_modifiers.replace.is_empty() {
+            lf = lf.with_columns(select_modifiers.replace);
         }
-        if !rename_cols.0.is_empty() {
-            lf = lf.rename(rename_cols.0, rename_cols.1);
+        if !select_modifiers.rename.before.is_empty() {
+            lf = lf.rename(
+                select_modifiers.rename.before,
+                select_modifiers.rename.after,
+            );
         }
         Ok(lf)
     }
@@ -1173,70 +1193,73 @@ impl SQLContext {
         &mut self,
         ObjectName(idents): &ObjectName,
         options: &WildcardAdditionalOptions,
-        excluded_cols: &mut Vec<String>,
-        rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
-        replace_exprs: &mut Vec<Expr>,
+        modifiers: &mut SelectModifiers,
         schema: Option<&Schema>,
     ) -> PolarsResult<Vec<Expr>> {
         let mut new_idents = idents.clone();
         new_idents.push(Ident::new("*"));
 
         let expr = resolve_compound_identifier(self, new_idents.deref(), schema);
-        self.process_wildcard_additional_options(
-            expr?,
-            options,
-            excluded_cols,
-            rename_cols,
-            replace_exprs,
-            schema,
-        )
+        self.process_wildcard_additional_options(expr?, options, modifiers, schema)
     }
 
     fn process_wildcard_additional_options(
         &mut self,
         exprs: Vec<Expr>,
         options: &WildcardAdditionalOptions,
-        excluded_cols: &mut Vec<String>,
-        rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
-        replace_exprs: &mut Vec<Expr>,
+        modifiers: &mut SelectModifiers,
         schema: Option<&Schema>,
     ) -> PolarsResult<Vec<Expr>> {
         // bail on (currently) unsupported wildcard options
         if options.opt_except.is_some() {
-            polars_bail!(SQLInterface: "EXCEPT wildcard option is unsupported (use EXCLUDE instead)")
-        } else if options.opt_ilike.is_some() {
-            polars_bail!(SQLInterface: "ILIKE wildcard option is currently unsupported")
+            polars_bail!(SQLInterface: "EXCEPT wildcard option is not supported (use EXCLUDE instead)")
+        } else if options.opt_exclude.is_some() && options.opt_ilike.is_some() {
+            polars_bail!(SQLInterface: "EXCLUDE and ILIKE wildcard options cannot be used together")
         } else if options.opt_rename.is_some() && options.opt_replace.is_some() {
             // pending an upstream fix: https://github.com/sqlparser-rs/sqlparser-rs/pull/1321
-            polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used simultaneously")
+            polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used together")
         }
 
         if let Some(items) = &options.opt_exclude {
-            *excluded_cols = match items {
-                ExcludeSelectItem::Single(ident) => vec![ident.value.clone()],
+            match items {
+                ExcludeSelectItem::Single(ident) => {
+                    modifiers.exclude.insert(ident.value.clone());
+                },
                 ExcludeSelectItem::Multiple(idents) => {
-                    idents.iter().map(|i| i.value.clone()).collect()
+                    modifiers
+                        .exclude
+                        .extend(idents.iter().map(|i| i.value.clone()));
                 },
             };
+        } else if let Some(item) = &options.opt_ilike {
+            let rx = escape(item.pattern.as_str())
+                .replace('%', ".*")
+                .replace('_', ".");
+
+            modifiers.ilike = Some(Regex::new(format!("^(?i){}$", rx).as_str()).unwrap());
         }
+
         if let Some(items) = &options.opt_rename {
             match items {
                 RenameSelectItem::Single(rename) => {
-                    rename_cols.0.push(rename.ident.value.clone());
-                    rename_cols.1.push(rename.alias.value.clone());
+                    modifiers.rename.before.push(rename.ident.value.clone());
+                    modifiers.rename.after.push(rename.alias.value.clone());
                 },
                 RenameSelectItem::Multiple(renames) => {
                     for rn in renames {
-                        rename_cols.0.push(rn.ident.value.clone());
-                        rename_cols.1.push(rn.alias.value.clone());
+                        modifiers.rename.before.push(rn.ident.value.clone());
+                        modifiers.rename.after.push(rn.alias.value.clone());
                     }
                 },
             }
         }
+
         if let Some(replacements) = &options.opt_replace {
             for rp in &replacements.items {
                 let replacement_expr = parse_sql_expr(&rp.expr, self, schema);
-                replace_exprs.push(replacement_expr?.alias(rp.column_name.value.as_str()));
+                modifiers
+                    .replace
+                    .push(replacement_expr?.alias(rp.column_name.value.as_str()));
             }
         }
         Ok(exprs)

diff --git a/py-polars/tests/unit/sql/test_structs.py b/py-polars/tests/unit/sql/test_structs.py
@@ -19,9 +19,17 @@ def df_struct() -> pl.DataFrame:
     ).select(pl.struct(pl.all()).alias("json_msg"))
 
 
-def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
+@pytest.mark.parametrize(
+    "order_by",
+    [
+        "ORDER BY json_msg.id DESC",
+        "ORDER BY 2 DESC",
+        "",
+    ],
+)
+def test_struct_field_selection(order_by: str, df_struct: pl.DataFrame) -> None:
     res = df_struct.sql(
-        """
+        f"""
         SELECT
           -- validate table alias resolution
           frame.json_msg.id AS ID,
@@ -32,10 +40,12 @@ def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
         WHERE
           json_msg.age > 20 AND
           json_msg.other.n IS NOT NULL  -- note: nested struct field
-        ORDER BY
-          json_msg.name DESC
+        {order_by}
         """
     )
+    if not order_by:
+        res = res.sort(by="ID", descending=True)
+
     expected = pl.DataFrame({"ID": [400, 200], "NAME": ["Zoe", "Bob"], "AGE": [45, 45]})
     assert_frame_equal(expected, res)