diff --git a/crates/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs index 19ae650f1e52..f5b67fcf63bf 100644 --- a/crates/polars-plan/src/dsl/meta.rs +++ b/crates/polars-plan/src/dsl/meta.rs @@ -68,6 +68,26 @@ impl MetaNameSpace { } } + /// Indicate if this expression only selects columns; the presence of any + /// transform operations will cause the check to return `false`, though + /// aliasing of the selected columns is optionally allowed. + pub fn is_column_selection(&self, allow_aliasing: bool) -> bool { + self.0.into_iter().all(|e| match e { + Expr::Column(_) + | Expr::Columns(_) + | Expr::DtypeColumn(_) + | Expr::Exclude(_, _) + | Expr::Nth(_) + | Expr::IndexColumn(_) + | Expr::Selector(_) + | Expr::Wildcard => true, + Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. } if allow_aliasing => { + true + }, + _ => false, + }) + } + /// Indicate if this expression expands to multiple expressions with regex expansion. pub fn is_regex_projection(&self) -> bool { self.0.into_iter().any(|e| match e { diff --git a/crates/polars-plan/src/logical_plan/format.rs b/crates/polars-plan/src/logical_plan/format.rs index 2118140b2d26..4c7b46b01969 100644 --- a/crates/polars-plan/src/logical_plan/format.rs +++ b/crates/polars-plan/src/logical_plan/format.rs @@ -164,7 +164,7 @@ impl fmt::Debug for Expr { Columns(names) => write!(f, "cols({names:?})"), DtypeColumn(dt) => write!(f, "dtype_columns({dt:?})"), IndexColumn(idxs) => write!(f, "index_columns({idxs:?})"), - Selector(_) => write!(f, "SELECTOR"), + Selector(_) => write!(f, "selector"), #[cfg(feature = "dtype-struct")] Field(names) => write!(f, ".field({names:?})"), } diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 2437146fd82b..0d40bba318f6 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3485,7 +3485,7 @@ def write_database( """ Write the data in a Polars DataFrame to a database. - .. versionadded:: 0.20.26 + .. versionchanged:: 0.20.26 Support for instantiated connection objects in addition to URI strings, and a new `engine_options` parameter. diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 775405e2594c..ec531dbd8ca2 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -70,7 +70,7 @@ def has_multiple_outputs(self) -> bool: Examples -------- - >>> e = pl.col(["a", "b"]).alias("bar") + >>> e = pl.col(["a", "b"]).name.suffix("_foo") >>> e.meta.has_multiple_outputs() True """ @@ -100,12 +100,50 @@ def is_regex_projection(self) -> bool: Examples -------- - >>> e = pl.col("^.*$").alias("bar") + >>> e = pl.col("^.*$").name.prefix("foo_") >>> e.meta.is_regex_projection() True """ return self._pyexpr.meta_is_regex_projection() + def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: + """ + Indicate if this expression only selects columns (optionally with aliasing). + + This can include bare columns, column matches by regex or dtype, selectors + and exclude ops, and (optionally) column/expression aliasing. + + .. versionadded:: 0.20.30 + + Parameters + ---------- + allow_aliasing + If False (default), any aliasing is not considered pure column selection. + Set True to allow for column selection that also includes aliasing. + + Examples + -------- + >>> import polars.selectors as cs + >>> e = pl.col("foo") + >>> e.meta.is_column_selection() + True + >>> e = pl.col("foo").alias("bar") + >>> e.meta.is_column_selection() + False + >>> e.meta.is_column_selection(allow_aliasing=True) + True + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.is_column_selection() + False + >>> e = cs.starts_with("foo") + >>> e.meta.is_column_selection() + True + >>> e = cs.starts_with("foo").exclude("foo!") + >>> e.meta.is_column_selection() + True + """ + return self._pyexpr.meta_is_column_selection(allow_aliasing) + @overload def output_name(self, *, raise_if_undetermined: Literal[True] = True) -> str: ... diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 4b3a143f865b..37823d52977b 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -150,11 +150,11 @@ def read_excel( """ Read Excel spreadsheet data into a DataFrame. - .. versionadded:: 0.20.6 + .. versionchanged:: 0.20.6 Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls). - .. versionadded:: 0.19.4 + .. versionchanged:: 0.19.4 Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb). - .. versionadded:: 0.19.3 + .. versionchanged:: 0.19.3 Added "openpyxl" engine, and added `schema_overrides` parameter. Parameters diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index ddc94fe4aa45..6e2e327f3b55 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -81,9 +81,14 @@ def is_selector(obj: Any) -> bool: def expand_selector( target: DataFrame | LazyFrame | Mapping[str, PolarsDataType], selector: SelectorType | Expr, + *, + strict: bool = True, ) -> tuple[str, ...]: """ - Expand a selector to column names with respect to a specific frame or schema target. + Expand selector to column names, with respect to a specific frame or target schema. + + .. versionchanged:: 0.20.30 + The `strict` parameter was added. Parameters ---------- @@ -91,6 +96,10 @@ def expand_selector( A polars DataFrame, LazyFrame or schema. selector An arbitrary polars selector (or compound selector). + strict + Setting False will additionally allow for a broader range of column selection + expressions (such as bare columns or use of `.exclude()`) to be expanded, not + just the dedicated selectors. Examples -------- @@ -118,22 +127,33 @@ def expand_selector( Expand selector with respect to a standalone schema: >>> schema = { - ... "colx": pl.Float32, - ... "coly": pl.Float64, - ... "colz": pl.Date, + ... "id": pl.Int64, + ... "desc": pl.String, + ... "count": pl.UInt32, + ... "value": pl.Float64, ... } - >>> cs.expand_selector(schema, cs.float()) - ('colx', 'coly') - """ - if not is_selector(selector): - msg = f"expected a selector; found {selector!r} instead." - raise TypeError(msg) + >>> cs.expand_selector(schema, cs.string() | cs.float()) + ('desc', 'value') + + Allow for non-strict selection expressions (such as those + including use of an `.exclude()` constraint) to be expanded: + >>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False) + ('count', 'value') + """ if isinstance(target, Mapping): from polars.dataframe import DataFrame target = DataFrame(schema=target) + if not ( + is_selector(selector) + if strict + else selector.meta.is_column_selection(allow_aliasing=False) + ): + msg = f"expected a selector; found {selector!r} instead." + raise TypeError(msg) + return tuple(target.select(selector).columns) diff --git a/py-polars/src/expr/meta.rs b/py-polars/src/expr/meta.rs index 225d8573d281..4e2a41b09769 100644 --- a/py-polars/src/expr/meta.rs +++ b/py-polars/src/expr/meta.rs @@ -54,6 +54,13 @@ impl PyExpr { self.inner.clone().meta().is_regex_projection() } + fn meta_is_column_selection(&self, allow_aliasing: bool) -> bool { + self.inner + .clone() + .meta() + .is_column_selection(allow_aliasing) + } + fn _meta_selector_add(&self, other: PyExpr) -> PyResult { let out = self .inner diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index fe554c694491..386c34ac03a9 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -5,6 +5,7 @@ import pytest import polars as pl +import polars.selectors as cs if TYPE_CHECKING: from pathlib import Path @@ -65,7 +66,7 @@ def test_undo_aliases() -> None: def test_meta_has_multiple_outputs() -> None: - e = pl.col(["a", "b"]).alias("bar") + e = pl.col(["a", "b"]).name.suffix("_foo") assert e.meta.has_multiple_outputs() @@ -80,8 +81,48 @@ def test_is_column() -> None: assert not e.meta.is_column() +@pytest.mark.parametrize( + ("expr", "is_column_selection"), + [ + # columns + (pl.col("foo"), True), + (pl.col("foo", "bar"), True), + (pl.col(pl.NUMERIC_DTYPES), True), + # column expressions + (pl.col("foo") + 100, False), + (pl.col("foo").floordiv(10), False), + (pl.col("foo") * pl.col("bar"), False), + # selectors / expressions + (cs.numeric() * 100, False), + (cs.temporal() - cs.time(), True), + (cs.numeric().exclude("value"), True), + ((cs.temporal() - cs.time()).exclude("dt"), True), + # top-level selection funcs + (pl.nth(2), True), + (pl.first(), True), + (pl.last(), True), + ], +) +def test_is_column_selection( + expr: pl.Expr, + is_column_selection: bool, +) -> None: + if is_column_selection: + assert expr.meta.is_column_selection() + assert expr.meta.is_column_selection(allow_aliasing=True) + expr = ( + expr.name.suffix("!") + if expr.meta.has_multiple_outputs() + else expr.alias("!") + ) + assert not expr.meta.is_column_selection() + assert expr.meta.is_column_selection(allow_aliasing=True) + else: + assert not expr.meta.is_column_selection() + + def test_meta_is_regex_projection() -> None: - e = pl.col("^.*$").alias("bar") + e = pl.col("^.*$").name.suffix("_foo") assert e.meta.is_regex_projection() assert e.meta.has_multiple_outputs() diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index d694d91dc8e6..d8cf3900c39d 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -376,6 +376,30 @@ def test_selector_ends_with(df: pl.DataFrame) -> None: df.select(cs.ends_with(999)) # type: ignore[arg-type] +def test_selector_expand() -> None: + schema = { + "id": pl.Int64, + "desc": pl.String, + "count": pl.UInt32, + "value": pl.Float64, + } + + expanded = cs.expand_selector(schema, cs.numeric() - cs.unsigned_integer()) + assert expanded == ("id", "value") + + with pytest.raises(TypeError, match="expected a selector"): + cs.expand_selector(schema, pl.exclude("id", "count")) + + with pytest.raises(TypeError, match="expected a selector"): + cs.expand_selector(schema, pl.col("value") // 100) + + expanded = cs.expand_selector(schema, pl.exclude("id", "count"), strict=False) + assert expanded == ("desc", "value") + + expanded = cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False) + assert expanded == ("count", "value") + + def test_selector_first_last(df: pl.DataFrame) -> None: assert df.select(cs.first()).columns == ["abc"] assert df.select(cs.last()).columns == ["qqR"]