Skip to content

Commit

Permalink
feat: Support SELECT * ILIKE wildcard syntax
Browse files Browse the repository at this point in the history
feat: Support `SELECT * ILIKE` wildcard syntax

feat: Support `SELECT * ILIKE` wildcard syntax in SQL interface
  • Loading branch information
alexander-beedie committed Jun 24, 2024
1 parent a69f6dd commit 4be0f71
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 90 deletions.
155 changes: 89 additions & 66 deletions crates/polars-sql/src/context.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::cell::RefCell;
use std::ops::Deref;

use polars_core::export::regex::{escape, Regex};
use polars_core::frame::row::Row;
use polars_core::prelude::*;
use polars_lazy::prelude::*;
Expand All @@ -23,6 +24,18 @@ use crate::sql_expr::{
};
use crate::table_functions::PolarsTableFunctions;

/// Column renames gathered from `SELECT * RENAME` wildcard options.
///
/// The two vectors are filled in lockstep: for every rename item the original
/// identifier is pushed to `before` and its alias to `after`, so the entries
/// at the same index form one `old -> new` pair. Both vectors are later handed
/// together to the frame's `rename` call.
struct RenameColumns {
// Original column names (the identifier being renamed).
before: Vec<String>,
// Replacement column names (the alias), index-aligned with `before`.
after: Vec<String>,
}

/// Accumulated state for the `SELECT *` wildcard modifiers of a single query.
///
/// One instance is created per SELECT statement and passed by mutable
/// reference to the wildcard-processing helpers, which populate it while the
/// projections are being resolved; the collected modifiers are applied once
/// the projections have been established.
struct SelectModifiers {
// Names of columns to drop from the final selection.
exclude: PlHashSet<String>, // SELECT * EXCLUDE
// Case-insensitive, fully-anchored regex built from the ILIKE pattern
// (`%` becomes `.*`, `_` becomes `.`); `None` when no ILIKE was given.
// Only columns whose name matches are retained.
ilike: Option<Regex>, // SELECT * ILIKE
// Index-aligned old/new column names applied as a final rename.
rename: RenameColumns, // SELECT * RENAME
// Replacement expressions, each aliased to the name of the column it
// replaces; applied via `with_columns` before the rename step.
replace: Vec<Expr>, // SELECT * REPLACE
}

/// The SQLContext is the main entry point for executing SQL queries.
#[derive(Clone)]
pub struct SQLContext {
Expand Down Expand Up @@ -607,9 +620,15 @@ impl SQLContext {
lf = self.process_where(lf, &select_stmt.selection)?;

// 'SELECT *' modifiers
let mut excluded_cols = vec![];
let mut replace_exprs = vec![];
let mut rename_cols = (&mut vec![], &mut vec![]);
let mut select_modifiers = SelectModifiers {
ilike: None,
exclude: PlHashSet::new(),
rename: RenameColumns {
before: vec![],
after: vec![],
},
replace: vec![],
};

// Column projections (SELECT clause)
let projections: Vec<Expr> = select_stmt
Expand All @@ -628,9 +647,7 @@ impl SQLContext {
.process_qualified_wildcard(
obj_name,
wildcard_options,
&mut excluded_cols,
&mut rename_cols,
&mut replace_exprs,
&mut select_modifiers,
Some(schema.deref()),
)?,
SelectItem::Wildcard(wildcard_options) => {
Expand All @@ -642,9 +659,7 @@ impl SQLContext {
self.process_wildcard_additional_options(
cols,
wildcard_options,
&mut excluded_cols,
&mut rename_cols,
&mut replace_exprs,
&mut select_modifiers,
Some(schema.deref()),
)?
},
Expand Down Expand Up @@ -703,34 +718,36 @@ impl SQLContext {
};

lf = if group_by_keys.is_empty() {
lf = if query.order_by.is_empty() {
// No sort, select cols as given
lf.select(projections)
} else {
// Add projections to the base frame as any of the
// original columns may be required for the sort
lf = lf.with_columns(projections.clone());

// Final/selected cols (also ensures accurate ordinal position refs)
let retained_cols = projections
.iter()
.map(|e| {
col(e
.to_field(schema.deref(), Context::Default)
.unwrap()
.name
.as_str())
})
.collect::<Vec<_>>();
// Establish final/selected cols, accounting for 'SELECT *' modifiers
let retained_cols: Vec<_> = projections
.iter()
.filter_map(|e| {
let field = e.to_field(schema.deref(), Context::Default).ok()?;
let name = field.name.to_string();
if select_modifiers
.ilike
.as_ref()
.map_or(true, |rx| rx.is_match(&name))
&& !select_modifiers.exclude.contains(&name)
{
Some(col(name.as_str()))
} else {
None
}
})
.collect();

lf = self.process_order_by(lf, &query.order_by, Some(&retained_cols))?;
lf.select(retained_cols)
};
// Discard any excluded cols
if !excluded_cols.is_empty() {
lf.drop(excluded_cols)
if query.order_by.is_empty() {
if select_modifiers.ilike.is_some() || !select_modifiers.exclude.is_empty() {
lf.select(projections).select(retained_cols)
} else {
lf.select(projections)
}
} else {
lf
// Include projections as any of the original columns may be required for the sort
lf = lf.with_columns(projections.clone());
self.process_order_by(lf, &query.order_by, Some(&retained_cols))?
.select(retained_cols)
}
} else {
lf = self.process_group_by(lf, &group_by_keys, &projections)?;
Expand Down Expand Up @@ -771,12 +788,15 @@ impl SQLContext {
None => lf,
};

// Apply final 'SELECT *' modifiers
if !replace_exprs.is_empty() {
lf = lf.with_columns(replace_exprs);
// Apply final 'SELECT *' REPLACE/RENAME modifiers
if !select_modifiers.replace.is_empty() {
lf = lf.with_columns(select_modifiers.replace);
}
if !rename_cols.0.is_empty() {
lf = lf.rename(rename_cols.0, rename_cols.1);
if !select_modifiers.rename.before.is_empty() {
lf = lf.rename(
select_modifiers.rename.before,
select_modifiers.rename.after,
);
}
Ok(lf)
}
Expand Down Expand Up @@ -1173,70 +1193,73 @@ impl SQLContext {
&mut self,
ObjectName(idents): &ObjectName,
options: &WildcardAdditionalOptions,
excluded_cols: &mut Vec<String>,
rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
replace_exprs: &mut Vec<Expr>,
modifiers: &mut SelectModifiers,
schema: Option<&Schema>,
) -> PolarsResult<Vec<Expr>> {
let mut new_idents = idents.clone();
new_idents.push(Ident::new("*"));

let expr = resolve_compound_identifier(self, new_idents.deref(), schema);
self.process_wildcard_additional_options(
expr?,
options,
excluded_cols,
rename_cols,
replace_exprs,
schema,
)
self.process_wildcard_additional_options(expr?, options, modifiers, schema)
}

fn process_wildcard_additional_options(
&mut self,
exprs: Vec<Expr>,
options: &WildcardAdditionalOptions,
excluded_cols: &mut Vec<String>,
rename_cols: &mut (&mut Vec<String>, &mut Vec<String>),
replace_exprs: &mut Vec<Expr>,
modifiers: &mut SelectModifiers,
schema: Option<&Schema>,
) -> PolarsResult<Vec<Expr>> {
// bail on (currently) unsupported wildcard options
if options.opt_except.is_some() {
polars_bail!(SQLInterface: "EXCEPT wildcard option is unsupported (use EXCLUDE instead)")
} else if options.opt_ilike.is_some() {
polars_bail!(SQLInterface: "ILIKE wildcard option is currently unsupported")
polars_bail!(SQLInterface: "EXCEPT wildcard option is not supported (use EXCLUDE instead)")
} else if options.opt_exclude.is_some() && options.opt_ilike.is_some() {
polars_bail!(SQLInterface: "EXCLUDE and ILIKE wildcard options cannot be used together")
} else if options.opt_rename.is_some() && options.opt_replace.is_some() {
// pending an upstream fix: https://github.com/sqlparser-rs/sqlparser-rs/pull/1321
polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used simultaneously")
polars_bail!(SQLInterface: "RENAME and REPLACE wildcard options cannot (yet) be used together")
}

if let Some(items) = &options.opt_exclude {
*excluded_cols = match items {
ExcludeSelectItem::Single(ident) => vec![ident.value.clone()],
match items {
ExcludeSelectItem::Single(ident) => {
modifiers.exclude.insert(ident.value.clone());
},
ExcludeSelectItem::Multiple(idents) => {
idents.iter().map(|i| i.value.clone()).collect()
modifiers
.exclude
.extend(idents.iter().map(|i| i.value.clone()));
},
};
} else if let Some(item) = &options.opt_ilike {
let rx = escape(item.pattern.as_str())
.replace('%', ".*")
.replace('_', ".");

modifiers.ilike = Some(Regex::new(format!("^(?i){}$", rx).as_str()).unwrap());
}

if let Some(items) = &options.opt_rename {
match items {
RenameSelectItem::Single(rename) => {
rename_cols.0.push(rename.ident.value.clone());
rename_cols.1.push(rename.alias.value.clone());
modifiers.rename.before.push(rename.ident.value.clone());
modifiers.rename.after.push(rename.alias.value.clone());
},
RenameSelectItem::Multiple(renames) => {
for rn in renames {
rename_cols.0.push(rn.ident.value.clone());
rename_cols.1.push(rn.alias.value.clone());
modifiers.rename.before.push(rn.ident.value.clone());
modifiers.rename.after.push(rn.alias.value.clone());
}
},
}
}

if let Some(replacements) = &options.opt_replace {
for rp in &replacements.items {
let replacement_expr = parse_sql_expr(&rp.expr, self, schema);
replace_exprs.push(replacement_expr?.alias(rp.column_name.value.as_str()));
modifiers
.replace
.push(replacement_expr?.alias(rp.column_name.value.as_str()));
}
}
Ok(exprs)
Expand Down
18 changes: 14 additions & 4 deletions py-polars/tests/unit/sql/test_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,17 @@ def df_struct() -> pl.DataFrame:
).select(pl.struct(pl.all()).alias("json_msg"))


def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
@pytest.mark.parametrize(
"order_by",
[
"ORDER BY json_msg.id DESC",
"ORDER BY 2 DESC",
"",
],
)
def test_struct_field_selection(order_by: str, df_struct: pl.DataFrame) -> None:
res = df_struct.sql(
"""
f"""
SELECT
-- validate table alias resolution
frame.json_msg.id AS ID,
Expand All @@ -32,10 +40,12 @@ def test_struct_field_selection(df_struct: pl.DataFrame) -> None:
WHERE
json_msg.age > 20 AND
json_msg.other.n IS NOT NULL -- note: nested struct field
ORDER BY
json_msg.name DESC
{order_by}
"""
)
if not order_by:
res = res.sort(by="ID", descending=True)

expected = pl.DataFrame({"ID": [400, 200], "NAME": ["Zoe", "Bob"], "AGE": [45, 45]})
assert_frame_equal(expected, res)

Expand Down
Loading

0 comments on commit 4be0f71

Please sign in to comment.