Skip to content

Commit

Permalink
feat: Add is_column_selection() to expression meta, enhance `expand…
Browse files Browse the repository at this point in the history
…_selector` (#16479)
  • Loading branch information
alexander-beedie authored May 26, 2024
1 parent 9eedeb9 commit cd04f3d
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 19 deletions.
20 changes: 20 additions & 0 deletions crates/polars-plan/src/dsl/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,26 @@ impl MetaNameSpace {
}
}

/// Report whether this expression is a pure column selection.
///
/// Every node visited while iterating over the expression must be one of the
/// column-selecting variants (`Column`, `Columns`, `DtypeColumn`, `Exclude`,
/// `Nth`, `IndexColumn`, `Selector` or `Wildcard`). When `allow_aliasing` is
/// `true`, the renaming variants (`Alias`, `KeepName`, `RenameAlias`) are
/// accepted as well. Any other node causes the check to return `false`.
pub fn is_column_selection(&self, allow_aliasing: bool) -> bool {
    self.0.into_iter().all(|node| {
        let selects_columns = matches!(
            node,
            Expr::Column(_)
                | Expr::Columns(_)
                | Expr::DtypeColumn(_)
                | Expr::Exclude(_, _)
                | Expr::Nth(_)
                | Expr::IndexColumn(_)
                | Expr::Selector(_)
                | Expr::Wildcard
        );
        // Renaming nodes only count when the caller has opted in.
        let is_permitted_alias = allow_aliasing
            && matches!(
                node,
                Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. }
            );
        selects_columns || is_permitted_alias
    })
}

/// Indicate if this expression expands to multiple expressions with regex expansion.
pub fn is_regex_projection(&self) -> bool {
self.0.into_iter().any(|e| match e {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/logical_plan/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ impl fmt::Debug for Expr {
Columns(names) => write!(f, "cols({names:?})"),
DtypeColumn(dt) => write!(f, "dtype_columns({dt:?})"),
IndexColumn(idxs) => write!(f, "index_columns({idxs:?})"),
Selector(_) => write!(f, "SELECTOR"),
Selector(_) => write!(f, "selector"),
#[cfg(feature = "dtype-struct")]
Field(names) => write!(f, ".field({names:?})"),
}
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3485,7 +3485,7 @@ def write_database(
"""
Write the data in a Polars DataFrame to a database.
.. versionadded:: 0.20.26
.. versionchanged:: 0.20.26
Support for instantiated connection objects in addition to URI strings, and
a new `engine_options` parameter.
Expand Down
42 changes: 40 additions & 2 deletions py-polars/polars/expr/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def has_multiple_outputs(self) -> bool:
Examples
--------
>>> e = pl.col(["a", "b"]).alias("bar")
>>> e = pl.col(["a", "b"]).name.suffix("_foo")
>>> e.meta.has_multiple_outputs()
True
"""
Expand Down Expand Up @@ -100,12 +100,50 @@ def is_regex_projection(self) -> bool:
Examples
--------
>>> e = pl.col("^.*$").alias("bar")
>>> e = pl.col("^.*$").name.prefix("foo_")
>>> e.meta.is_regex_projection()
True
"""
return self._pyexpr.meta_is_regex_projection()

def is_column_selection(self, *, allow_aliasing: bool = False) -> bool:
    """
    Check whether this expression does nothing except select columns.

    Bare columns, column matches by regex or dtype, selectors and exclude ops
    all count as column selection; aliasing of the selection is only accepted
    when explicitly allowed.

    .. versionadded:: 0.20.30

    Parameters
    ----------
    allow_aliasing
        If False (default), an expression that includes any aliasing is not
        considered pure column selection. Set True to accept column selection
        that also includes aliasing.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> e = pl.col("foo")
    >>> e.meta.is_column_selection()
    True
    >>> e = pl.col("foo").alias("bar")
    >>> e.meta.is_column_selection()
    False
    >>> e.meta.is_column_selection(allow_aliasing=True)
    True
    >>> e = pl.col("foo") * pl.col("bar")
    >>> e.meta.is_column_selection()
    False
    >>> e = cs.starts_with("foo")
    >>> e.meta.is_column_selection()
    True
    >>> e = cs.starts_with("foo").exclude("foo!")
    >>> e.meta.is_column_selection()
    True
    """
    # The check itself is delegated to the underlying expression object.
    pyexpr = self._pyexpr
    return pyexpr.meta_is_column_selection(allow_aliasing)

@overload
def output_name(self, *, raise_if_undetermined: Literal[True] = True) -> str: ...

Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@ def read_excel(
"""
Read Excel spreadsheet data into a DataFrame.
.. versionadded:: 0.20.6
.. versionchanged:: 0.20.6
Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
.. versionadded:: 0.19.4
.. versionchanged:: 0.19.4
Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb).
.. versionadded:: 0.19.3
.. versionchanged:: 0.19.3
Added "openpyxl" engine, and added `schema_overrides` parameter.
Parameters
Expand Down
40 changes: 30 additions & 10 deletions py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,25 @@ def is_selector(obj: Any) -> bool:
def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: SelectorType | Expr,
*,
strict: bool = True,
) -> tuple[str, ...]:
"""
Expand a selector to column names with respect to a specific frame or schema target.
Expand selector to column names, with respect to a specific frame or target schema.
.. versionchanged:: 0.20.30
The `strict` parameter was added.
Parameters
----------
target
A polars DataFrame, LazyFrame or schema.
selector
An arbitrary polars selector (or compound selector).
strict
Setting False will additionally allow for a broader range of column selection
expressions (such as bare columns or use of `.exclude()`) to be expanded, not
just the dedicated selectors.
Examples
--------
Expand Down Expand Up @@ -118,22 +127,33 @@ def expand_selector(
Expand selector with respect to a standalone schema:
>>> schema = {
... "colx": pl.Float32,
... "coly": pl.Float64,
... "colz": pl.Date,
... "id": pl.Int64,
... "desc": pl.String,
... "count": pl.UInt32,
... "value": pl.Float64,
... }
>>> cs.expand_selector(schema, cs.float())
('colx', 'coly')
"""
if not is_selector(selector):
msg = f"expected a selector; found {selector!r} instead."
raise TypeError(msg)
>>> cs.expand_selector(schema, cs.string() | cs.float())
('desc', 'value')
Allow for non-strict selection expressions (such as those
including use of an `.exclude()` constraint) to be expanded:
>>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False)
('count', 'value')
"""
if isinstance(target, Mapping):
from polars.dataframe import DataFrame

target = DataFrame(schema=target)

if not (
is_selector(selector)
if strict
else selector.meta.is_column_selection(allow_aliasing=False)
):
msg = f"expected a selector; found {selector!r} instead."
raise TypeError(msg)

return tuple(target.select(selector).columns)


Expand Down
7 changes: 7 additions & 0 deletions py-polars/src/expr/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ impl PyExpr {
self.inner.clone().meta().is_regex_projection()
}

/// Forward to the meta namespace's `is_column_selection` check on a clone of
/// the wrapped expression, passing `allow_aliasing` through unchanged.
fn meta_is_column_selection(&self, allow_aliasing: bool) -> bool {
    let meta = self.inner.clone().meta();
    meta.is_column_selection(allow_aliasing)
}

fn _meta_selector_add(&self, other: PyExpr) -> PyResult<PyExpr> {
let out = self
.inner
Expand Down
45 changes: 43 additions & 2 deletions py-polars/tests/unit/operations/namespaces/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

import polars as pl
import polars.selectors as cs

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -65,7 +66,7 @@ def test_undo_aliases() -> None:


def test_meta_has_multiple_outputs() -> None:
e = pl.col(["a", "b"]).alias("bar")
e = pl.col(["a", "b"]).name.suffix("_foo")
assert e.meta.has_multiple_outputs()


Expand All @@ -80,8 +81,48 @@ def test_is_column() -> None:
assert not e.meta.is_column()


@pytest.mark.parametrize(
    ("expr", "is_column_selection"),
    [
        # columns
        (pl.col("foo"), True),
        (pl.col("foo", "bar"), True),
        (pl.col(pl.NUMERIC_DTYPES), True),
        # column expressions
        (pl.col("foo") + 100, False),
        (pl.col("foo").floordiv(10), False),
        (pl.col("foo") * pl.col("bar"), False),
        # selectors / expressions
        (cs.numeric() * 100, False),
        (cs.temporal() - cs.time(), True),
        (cs.numeric().exclude("value"), True),
        ((cs.temporal() - cs.time()).exclude("dt"), True),
        # top-level selection funcs
        (pl.nth(2), True),
        (pl.first(), True),
        (pl.last(), True),
    ],
)
def test_is_column_selection(
    expr: pl.Expr,
    is_column_selection: bool,
) -> None:
    """Check `meta.is_column_selection` with and without aliasing allowed."""
    # Expressions that transform data are never a pure column selection.
    if not is_column_selection:
        assert not expr.meta.is_column_selection()
        return

    # An un-aliased selection passes regardless of the aliasing flag.
    assert expr.meta.is_column_selection()
    assert expr.meta.is_column_selection(allow_aliasing=True)

    # Rename the selection: use a suffix for multi-output expressions and a
    # plain alias otherwise.
    if expr.meta.has_multiple_outputs():
        renamed = expr.name.suffix("!")
    else:
        renamed = expr.alias("!")

    # Once renamed, the selection only passes when aliasing is allowed.
    assert not renamed.meta.is_column_selection()
    assert renamed.meta.is_column_selection(allow_aliasing=True)


def test_meta_is_regex_projection() -> None:
e = pl.col("^.*$").alias("bar")
e = pl.col("^.*$").name.suffix("_foo")
assert e.meta.is_regex_projection()
assert e.meta.has_multiple_outputs()

Expand Down
24 changes: 24 additions & 0 deletions py-polars/tests/unit/test_selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,30 @@ def test_selector_ends_with(df: pl.DataFrame) -> None:
df.select(cs.ends_with(999)) # type: ignore[arg-type]


def test_selector_expand() -> None:
    """Check `expand_selector` in strict (default) and non-strict modes."""
    schema = {
        "id": pl.Int64,
        "desc": pl.String,
        "count": pl.UInt32,
        "value": pl.Float64,
    }

    # Strict mode (the default) expands dedicated selectors...
    signed_numeric = cs.numeric() - cs.unsigned_integer()
    assert cs.expand_selector(schema, signed_numeric) == ("id", "value")

    # ...and rejects expressions that are not selectors.
    for not_a_selector in (pl.exclude("id", "count"), pl.col("value") // 100):
        with pytest.raises(TypeError, match="expected a selector"):
            cs.expand_selector(schema, not_a_selector)

    # Non-strict mode also accepts broader column-selection expressions.
    assert cs.expand_selector(
        schema, pl.exclude("id", "count"), strict=False
    ) == ("desc", "value")
    assert cs.expand_selector(
        schema, cs.numeric().exclude("id"), strict=False
    ) == ("count", "value")


def test_selector_first_last(df: pl.DataFrame) -> None:
assert df.select(cs.first()).columns == ["abc"]
assert df.select(cs.last()).columns == ["qqR"]
Expand Down

0 comments on commit cd04f3d

Please sign in to comment.