From 4be0f71e7519e9407e65ecaf368a670d8fca5e7c Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Tue, 25 Jun 2024 00:56:55 +0400 Subject: [PATCH] feat: Support `SELECT * ILIKE` wildcard syntax feat: Support `SELECT * ILIKE` wildcard syntax feat: Support `SELECT * ILIKE` wildcard syntax in SQL interface --- crates/polars-sql/src/context.rs | 155 ++++++++++-------- py-polars/tests/unit/sql/test_structs.py | 18 +- .../tests/unit/sql/test_wildcard_opts.py | 99 ++++++++--- 3 files changed, 182 insertions(+), 90 deletions(-) diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index ea86de06f4e5a..b20db31f47439 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -1,6 +1,7 @@ use std::cell::RefCell; use std::ops::Deref; +use polars_core::export::regex::{escape, Regex}; use polars_core::frame::row::Row; use polars_core::prelude::*; use polars_lazy::prelude::*; @@ -23,6 +24,18 @@ use crate::sql_expr::{ }; use crate::table_functions::PolarsTableFunctions; +struct RenameColumns { + before: Vec, + after: Vec, +} + +struct SelectModifiers { + exclude: PlHashSet, // SELECT * EXCLUDE + ilike: Option, // SELECT * ILIKE + rename: RenameColumns, // SELECT * RENAME + replace: Vec, // SELECT * REPLACE +} + /// The SQLContext is the main entry point for executing SQL queries. #[derive(Clone)] pub struct SQLContext { @@ -607,9 +620,15 @@ impl SQLContext { lf = self.process_where(lf, &select_stmt.selection)?; // 'SELECT *' modifiers - let mut excluded_cols = vec![]; - let mut replace_exprs = vec![]; - let mut rename_cols = (&mut vec![], &mut vec![]); + let mut select_modifiers = SelectModifiers { + ilike: None, + exclude: PlHashSet::new(), + rename: RenameColumns { + before: vec![], + after: vec![], + }, + replace: vec![], + }; // Column projections (SELECT clause) let projections: Vec = select_stmt @@ -628,9 +647,7 @@ impl SQLContext { .process_qualified_wildcard( obj_name, wildcard_options, - &mut excluded_cols, - &mut rename_cols, - &mut replace_exprs, + &mut select_modifiers, Some(schema.deref()), )?, SelectItem::Wildcard(wildcard_options) => { @@ -642,9 +659,7 @@ impl SQLContext { self.process_wildcard_additional_options( cols, wildcard_options, - &mut excluded_cols, - &mut rename_cols, - &mut replace_exprs, + &mut select_modifiers, Some(schema.deref()), )? }, @@ -703,34 +718,36 @@ impl SQLContext { }; lf = if group_by_keys.is_empty() { - lf = if query.order_by.is_empty() { - // No sort, select cols as given - lf.select(projections) - } else { - // Add projections to the base frame as any of the - // original columns may be required for the sort - lf = lf.with_columns(projections.clone()); - - // Final/selected cols (also ensures accurate ordinal position refs) - let retained_cols = projections - .iter() - .map(|e| { - col(e - .to_field(schema.deref(), Context::Default) - .unwrap() - .name - .as_str()) - }) - .collect::>(); + // Establish final/selected cols, accounting for 'SELECT *' modifiers + let retained_cols: Vec<_> = projections + .iter() + .filter_map(|e| { + let field = e.to_field(schema.deref(), Context::Default).ok()?; + let name = field.name.to_string(); + if select_modifiers + .ilike + .as_ref() + .map_or(true, |rx| rx.is_match(&name)) + && !select_modifiers.exclude.contains(&name) + { + Some(col(name.as_str())) + } else { + None + } + }) + .collect(); - lf = self.process_order_by(lf, &query.order_by, Some(&retained_cols))?; - lf.select(retained_cols) - }; - // Discard any excluded cols - if !excluded_cols.is_empty() { - lf.drop(excluded_cols) + if query.order_by.is_empty() { + if select_modifiers.ilike.is_some() || !select_modifiers.exclude.is_empty() { + lf.select(projections).select(retained_cols) + } else { + lf.select(projections) + } } else { - lf + // Include projections as any of the original columns may be required for the sort + lf = lf.with_columns(projections.clone()); + self.process_order_by(lf, &query.order_by, Some(&retained_cols))? + .select(retained_cols) } } else { lf = self.process_group_by(lf, &group_by_keys, &projections)?; @@ -771,12 +788,15 @@ impl SQLContext { None => lf, }; - // Apply final 'SELECT *' modifiers - if !replace_exprs.is_empty() { - lf = lf.with_columns(replace_exprs); + // Apply final 'SELECT *' REPLACE/RENAME modifiers + if !select_modifiers.replace.is_empty() { + lf = lf.with_columns(select_modifiers.replace); } - if !rename_cols.0.is_empty() { - lf = lf.rename(rename_cols.0, rename_cols.1); + if !select_modifiers.rename.before.is_empty() { + lf = lf.rename( + select_modifiers.rename.before, + select_modifiers.rename.after, + ); } Ok(lf) } @@ -1173,70 +1193,73 @@ impl SQLContext { &mut self, ObjectName(idents): &ObjectName, options: &WildcardAdditionalOptions, - excluded_cols: &mut Vec, - rename_cols: &mut (&mut Vec, &mut Vec), - replace_exprs: &mut Vec, + modifiers: &mut SelectModifiers, schema: Option<&Schema>, ) -> PolarsResult> { let mut new_idents = idents.clone(); new_idents.push(Ident::new("*")); let expr = resolve_compound_identifier(self, new_idents.deref(), schema); - self.process_wildcard_additional_options( - expr?, - options, - excluded_cols, - rename_cols, - replace_exprs, - schema, - ) + self.process_wildcard_additional_options(expr?, options, modifiers, schema) } fn process_wildcard_additional_options( &mut self, exprs: Vec, options: &WildcardAdditionalOptions, - excluded_cols: &mut Vec, - rename_cols: &mut (&mut Vec, &mut Vec), - replace_exprs: &mut Vec, + modifiers: &mut SelectModifiers, schema: Option<&Schema>, ) -> PolarsResult> { // bail on (currently) unsupported wildcard options if options.opt_except.is_some() { - polars_bail!(SQLInterface: "EXCEPT wildcard option is unsupported (use EXCLUDE instead)") - } else if options.opt_ilike.is_some() { - polars_bail!(SQLInterface: "ILIKE wildcard option is currently unsupported") + polars_bail!(SQLInterface: "EXCEPT wildcard option is not supported (use EXCLUDE instead)") + } else if options.opt_exclude.is_some() && options.opt_ilike.is_some() { + polars_bail!(SQLInterface: "EXCLUDE and ILIKE wildcard options cannot be used together") } else if options.opt_rename.is_some() && options.opt_replace.is_some() { // pending an upstream fix: https://github.com/sqlparser-rs/sqlparser-rs/pull/1321 - polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used simultaneously") + polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used together") } if let Some(items) = &options.opt_exclude { - *excluded_cols = match items { - ExcludeSelectItem::Single(ident) => vec![ident.value.clone()], + match items { + ExcludeSelectItem::Single(ident) => { + modifiers.exclude.insert(ident.value.clone()); + }, ExcludeSelectItem::Multiple(idents) => { - idents.iter().map(|i| i.value.clone()).collect() + modifiers + .exclude + .extend(idents.iter().map(|i| i.value.clone())); }, }; + } else if let Some(item) = &options.opt_ilike { + let rx = escape(item.pattern.as_str()) + .replace('%', ".*") + .replace('_', "."); + + modifiers.ilike = Some(Regex::new(format!("^(?i){}$", rx).as_str()).unwrap()); } + if let Some(items) = &options.opt_rename { match items { RenameSelectItem::Single(rename) => { - rename_cols.0.push(rename.ident.value.clone()); - rename_cols.1.push(rename.alias.value.clone()); + modifiers.rename.before.push(rename.ident.value.clone()); + modifiers.rename.after.push(rename.alias.value.clone()); }, RenameSelectItem::Multiple(renames) => { for rn in renames { - rename_cols.0.push(rn.ident.value.clone()); - rename_cols.1.push(rn.alias.value.clone()); + modifiers.rename.before.push(rn.ident.value.clone()); + modifiers.rename.after.push(rn.alias.value.clone()); } }, } } + if let Some(replacements) = &options.opt_replace { for rp in &replacements.items { let replacement_expr = parse_sql_expr(&rp.expr, self, schema); - replace_exprs.push(replacement_expr?.alias(rp.column_name.value.as_str())); + modifiers + .replace + .push(replacement_expr?.alias(rp.column_name.value.as_str())); } } Ok(exprs) diff --git a/py-polars/tests/unit/sql/test_structs.py b/py-polars/tests/unit/sql/test_structs.py index db965efcd86d6..73adeb4c06acb 100644 --- a/py-polars/tests/unit/sql/test_structs.py +++ b/py-polars/tests/unit/sql/test_structs.py @@ -19,9 +19,17 @@ def df_struct() -> pl.DataFrame: ).select(pl.struct(pl.all()).alias("json_msg")) -def test_struct_field_selection(df_struct: pl.DataFrame) -> None: +@pytest.mark.parametrize( + "order_by", + [ + "ORDER BY json_msg.id DESC", + "ORDER BY 2 DESC", + "", + ], +) +def test_struct_field_selection(order_by: str, df_struct: pl.DataFrame) -> None: res = df_struct.sql( - """ + f""" SELECT -- validate table alias resolution frame.json_msg.id AS ID, @@ -32,10 +40,12 @@ def test_struct_field_selection(df_struct: pl.DataFrame) -> None: WHERE json_msg.age > 20 AND json_msg.other.n IS NOT NULL -- note: nested struct field - ORDER BY - json_msg.name DESC + {order_by} """ ) + if not order_by: + res = res.sort(by="ID", descending=True) + expected = pl.DataFrame({"ID": [400, 200], "NAME": ["Zoe", "Bob"], "AGE": [45, 45]}) assert_frame_equal(expected, res) diff --git a/py-polars/tests/unit/sql/test_wildcard_opts.py b/py-polars/tests/unit/sql/test_wildcard_opts.py index ad17a215f7dac..c3bd75a7ef662 100644 --- a/py-polars/tests/unit/sql/test_wildcard_opts.py +++ b/py-polars/tests/unit/sql/test_wildcard_opts.py @@ -5,21 +5,30 @@ import pytest import polars as pl -from polars.exceptions import DuplicateError +from polars.exceptions import DuplicateError, SQLInterfaceError +from polars.testing import assert_frame_equal @pytest.fixture() def df() -> pl.DataFrame: - return pl.DataFrame({"num": [999, 666], "str": ["b", "a"], "val": [2.0, 0.5]}) + return pl.DataFrame( + { + "ID": [333, 999], + "FirstName": ["Bruce", "Clark"], + "LastName": ["Wayne", "Kent"], + "Address": ["The Batcave", "Fortress of Solitude"], + "City": ["Gotham", "Metropolis"], + } + ) @pytest.mark.parametrize( ("excluded", "expected"), [ - ("num", ["str", "val"]), - ("(val, num)", ["str"]), - ("(str, num)", ["val"]), - ("(str, val, num)", []), + ("ID", ["FirstName", "LastName", "Address", "City"]), + ("(ID)", ["FirstName", "LastName", "Address", "City"]), + ("(Address, LastName, FirstName)", ["ID", "City"]), + ('("ID", "FirstName", "LastName", "Address", "City")', []), ], ) def test_select_exclude( @@ -30,18 +39,65 @@ def test_select_exclude( assert df.sql(f"SELECT * EXCLUDE {excluded} FROM self").columns == expected +def test_select_exclude_order_by( + df: pl.DataFrame, +) -> None: + expected = pl.DataFrame( + { + "FirstName": ["Clark", "Bruce"], + "Address": ["Fortress of Solitude", "The Batcave"], + } + ) + for order_by in ("ORDER BY 2", "ORDER BY 1 DESC"): + actual = df.sql(f"SELECT * EXCLUDE (ID,LastName,City) FROM self {order_by}") + assert_frame_equal(actual, expected) + + def test_select_exclude_error(df: pl.DataFrame) -> None: - with pytest.raises(DuplicateError, match="the name 'num' is duplicate"): - # note: missing "()" around the exclude option results in dupe col - assert df.sql("SELECT * EXCLUDE val, num FROM self") + # EXCLUDE and ILIKE are not allowed together + with pytest.raises(SQLInterfaceError, match="ILIKE"): + assert df.sql("SELECT * EXCLUDE Address ILIKE '%o%' FROM self") + + # note: missing "()" around the exclude option results in dupe col + with pytest.raises(DuplicateError, match="the name 'City' is duplicate"): + assert df.sql("SELECT * EXCLUDE Address, City FROM self") + + +def test_ilike(df: pl.DataFrame) -> None: + assert df.sql("SELECT * ILIKE 'a%e' FROM self").columns == [] + assert df.sql("SELECT * ILIKE '%nam_' FROM self").columns == [ + "FirstName", + "LastName", + ] + assert df.sql("SELECT * ILIKE '%a%e%' FROM self").columns == [ + "FirstName", + "LastName", + "Address", + ] + assert df.sql( + """SELECT * ILIKE '%I%' RENAME (FirstName AS Name) FROM self""" + ).columns == [ + "ID", + "Name", + "City", + ] @pytest.mark.parametrize( ("renames", "expected"), [ - ("val AS value", ["num", "str", "value"]), - ("(num AS flt)", ["flt", "str", "val"]), - ("(val AS value, num AS flt)", ["flt", "str", "value"]), + ( + "Address AS Location", + ["ID", "FirstName", "LastName", "Location", "City"], + ), + ( + '(Address AS "Location")', + ["ID", "FirstName", "LastName", "Location", "City"], + ), + ( + '("Address" AS Location, "ID" AS PersonID)', + ["PersonID", "FirstName", "LastName", "Location", "City"], + ), ], ) def test_select_rename( @@ -53,27 +109,30 @@ def test_select_rename( @pytest.mark.parametrize( - ("replacements", "check_cols", "expected"), + ("replacements", "order_by", "check_cols", "expected"), [ ( - "(num // 3 AS num)", - ["num"], - [(333,), (222,)], + "(ID // 3 AS ID)", + "", + ["ID"], + [(111,), (333,)], ), ( - "((str || str) AS str, num / 3 AS num)", - ["num", "str"], - [(333, "bb"), (222, "aa")], + "((City || ':' || City) AS City, ID // 3 AS ID)", + "ORDER BY ID DESC", + ["City", "ID"], + [("Metropolis:Metropolis", 333), ("Gotham:Gotham", 111)], ), ], ) def test_select_replace( replacements: str, + order_by: str, check_cols: list[str], expected: list[tuple[Any]], df: pl.DataFrame, ) -> None: - res = df.sql(f"SELECT * REPLACE {replacements} FROM self") + res = df.sql(f"SELECT * REPLACE {replacements} FROM self {order_by}") assert res.select(check_cols).rows() == expected assert res.columns == df.columns