From 3117ab154e82cc1be60e61ac2e2cee9c00873d42 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 22 Jun 2024 16:16:08 +0100 Subject: [PATCH] feat: Update `DataFrame.pivot` to allow `index=None` when `values` is set (#17126) --- crates/polars-lazy/src/frame/pivot.rs | 4 +- crates/polars-ops/src/frame/pivot/mod.rs | 74 +++++---- crates/polars/tests/it/core/pivot.rs | 24 +-- docs/releases/upgrade/1.md | 9 +- .../rust/user-guide/transformations/pivot.rs | 4 +- py-polars/polars/dataframe/frame.py | 148 ++++++++++-------- py-polars/src/dataframe/general.rs | 2 +- py-polars/tests/unit/operations/test_pivot.py | 12 +- 8 files changed, 161 insertions(+), 116 deletions(-) diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index a3b2acab4bd8..759981c52f0e 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -33,7 +33,7 @@ impl PhysicalAggExpr for PivotExpr { pub fn pivot( df: &DataFrame, on: I0, - index: I1, + index: Option, values: Option, sort_columns: bool, agg_expr: Option, @@ -59,7 +59,7 @@ where pub fn pivot_stable( df: &DataFrame, on: I0, - index: I1, + index: Option, values: Option, sort_columns: bool, agg_expr: Option, diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs index 11a2d0b64248..94c6d33100c8 100644 --- a/crates/polars-ops/src/frame/pivot/mod.rs +++ b/crates/polars-ops/src/frame/pivot/mod.rs @@ -85,7 +85,7 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series { pub fn pivot( pivot_df: &DataFrame, on: I0, - index: I1, + index: Option, values: Option, sort_columns: bool, agg_fn: Option, @@ -99,15 +99,11 @@ where S1: AsRef, S2: AsRef, { - let index = index - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); let on = on .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = get_values_columns(pivot_df, &index, &on, values); + let (index, values) = 
assign_remaining_columns(pivot_df, &on, index, values)?; pivot_impl( pivot_df, &on, @@ -128,7 +124,7 @@ where pub fn pivot_stable( pivot_df: &DataFrame, on: I0, - index: I1, + index: Option, values: Option, sort_columns: bool, agg_fn: Option, @@ -142,15 +138,11 @@ where S1: AsRef, S2: AsRef, { - let index = index - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); let on = on .into_iter() .map(|s| s.as_ref().to_string()) .collect::>(); - let values = get_values_columns(pivot_df, &index, &on, values); + let (index, values) = assign_remaining_columns(pivot_df, &on, index, values)?; pivot_impl( pivot_df, &on, @@ -163,28 +155,52 @@ where ) } -/// Determine `values` columns, which is optional in `pivot` calls. +/// Ensure both `index` and `values` are populated with `Vec`. /// -/// If not specified (i.e. is `None`), use all remaining columns in the -/// `DataFrame` after `index` and `columns` have been excluded. -fn get_values_columns( +/// - If `index` is None, assign columns not in `on` and `values` to it. +/// - If `values` is None, assign columns not in `on` and `index` to it. +/// - At least one of `index` and `values` must be non-null. 
+fn assign_remaining_columns( df: &DataFrame, - index: &[String], on: &[String], - values: Option, -) -> Vec + index: Option, + values: Option, +) -> PolarsResult<(Vec, Vec)> where - I: IntoIterator, - S: AsRef, + I1: IntoIterator, + I2: IntoIterator, + S1: AsRef, + S2: AsRef, { - match values { - Some(v) => v.into_iter().map(|s| s.as_ref().to_string()).collect(), - None => df - .get_column_names() - .into_iter() - .map(|c| c.to_string()) - .filter(|c| !(index.contains(c) | on.contains(c))) - .collect(), + match (index, values) { + (Some(index), Some(values)) => { + let index = index.into_iter().map(|s| s.as_ref().to_string()).collect(); + let values = values.into_iter().map(|s| s.as_ref().to_string()).collect(); + Ok((index, values)) + }, + (Some(index), None) => { + let index: Vec = index.into_iter().map(|s| s.as_ref().to_string()).collect(); + let values = df + .get_column_names() + .into_iter() + .map(|s| s.to_string()) + .filter(|c| !(index.contains(c) | on.contains(c))) + .collect(); + Ok((index, values)) + }, + (None, Some(values)) => { + let values: Vec = values.into_iter().map(|s| s.as_ref().to_string()).collect(); + let index = df + .get_column_names() + .into_iter() + .map(|s| s.to_string()) + .filter(|c| !(values.contains(c) | on.contains(c))) + .collect(); + Ok((index, values)) + }, + (None, None) => { + polars_bail!(InvalidOperation: "`index` and `values` cannot both be None in `pivot` operation") + }, } } diff --git a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs index 144c3291a54e..e6e507be3163 100644 --- a/crates/polars/tests/it/core/pivot.rs +++ b/crates/polars/tests/it/core/pivot.rs @@ -16,7 +16,7 @@ fn test_pivot_date_() -> PolarsResult<()> { let out = pivot( &df, ["values1"], - ["index"], + Some(["index"]), Some(["values2"]), true, Some(PivotAgg::Count), @@ -34,7 +34,7 @@ fn test_pivot_date_() -> PolarsResult<()> { let mut out = pivot_stable( &df, ["values2"], - ["index"], + Some(["index"]), Some(["values1"]), 
true, Some(PivotAgg::First), @@ -64,7 +64,7 @@ fn test_pivot_old() { let pvt = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Sum), @@ -79,7 +79,7 @@ fn test_pivot_old() { let pvt = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Min), @@ -93,7 +93,7 @@ fn test_pivot_old() { let pvt = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Max), @@ -107,7 +107,7 @@ fn test_pivot_old() { let pvt = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Mean), @@ -121,7 +121,7 @@ fn test_pivot_old() { let pvt = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Count), @@ -149,7 +149,7 @@ fn test_pivot_categorical() -> PolarsResult<()> { let out = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), true, Some(PivotAgg::Count), @@ -174,7 +174,7 @@ fn test_pivot_new() -> PolarsResult<()> { let out = (pivot_stable( &df, ["cols1"], - ["index1", "index2"], + Some(["index1", "index2"]), Some(["values1"]), true, Some(PivotAgg::Sum), @@ -191,7 +191,7 @@ fn test_pivot_new() -> PolarsResult<()> { let out = pivot_stable( &df, ["cols1", "cols2"], - ["index1", "index2"], + Some(["index1", "index2"]), Some(["values1"]), true, Some(PivotAgg::Sum), @@ -222,7 +222,7 @@ fn test_pivot_2() -> PolarsResult<()> { let out = pivot_stable( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::First), @@ -255,7 +255,7 @@ fn test_pivot_datetime() -> PolarsResult<()> { let out = pivot( &df, ["columns"], - ["index"], + Some(["index"]), Some(["values"]), false, Some(PivotAgg::Sum), diff --git a/docs/releases/upgrade/1.md b/docs/releases/upgrade/1.md index 0e5664e64e4e..3c0a32176f3d 100644 --- a/docs/releases/upgrade/1.md +++ b/docs/releases/upgrade/1.md @@ -393,7 +393,7 @@ After: ... 
"test_2": [100, 100, 60, 60], ... } ... ) ->>> df.pivot(index='name', on='subject', values=['test_1', 'test_2']) +>>> df.pivot('subject', index='name') ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐ │ name ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ @@ -404,6 +404,13 @@ After: └───────┴──────────────┴────────────────┴──────────────┴────────────────┘ ``` +Note that the function signature has also changed: + +- `columns` has been renamed to `on`, and is now the first positional argument. +- `index` and `values` are both optional. If `index` is not specified, then it + will use all columns not specified in `on` and `values`. If `values` is + not specified, it will use all columns not specified in `on` and `index`. + ### Support Decimal types by default when converting from Arrow Update conversion from Arrow to always convert Decimals into Polars Decimals, rather than cast to Float64. diff --git a/docs/src/rust/user-guide/transformations/pivot.rs b/docs/src/rust/user-guide/transformations/pivot.rs index 804ead13f056..5072ed82d52c 100644 --- a/docs/src/rust/user-guide/transformations/pivot.rs +++ b/docs/src/rust/user-guide/transformations/pivot.rs @@ -14,7 +14,7 @@ fn main() -> Result<(), Box> { // --8<-- [end:df] // --8<-- [start:eager] - let out = pivot(&df, ["foo"], ["bar"], Some(["N"]), false, None, None)?; + let out = pivot(&df, ["foo"], Some(["bar"]), Some(["N"]), false, None, None)?; println!("{}", &out); // --8<-- [end:eager] @@ -23,7 +23,7 @@ fn main() -> Result<(), Box> { let q2 = pivot( &q.collect()?, ["foo"], - ["bar"], + Some(["bar"]), Some(["N"]), false, None, diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 1ad0fa9ecbf5..cb0e91f8051e 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7586,8 +7586,8 @@ def pivot( self, on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, - index: 
ColumnNameOrSelector | Sequence[ColumnNameOrSelector], - values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, + index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, + values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None, aggregate_function: PivotAgg | Expr | None = None, maintain_order: bool = True, sort_columns: bool = False, @@ -7605,9 +7605,13 @@ def pivot( Name of the column(s) whose values will be used as the header of the output DataFrame. index - One or multiple keys to group by. + One or multiple keys to group by. If None, all remaining columns not specified + in `on` and `values` will be used. At least one of `index` and `values` must + be specified. values - Column values to aggregate. If None, all remaining columns will be used. + Column values to aggregate. If None, all remaining columns not specified + in `on` and `index` will be used. At least one of `index` and `values` must + be specified. aggregate_function Choose from: @@ -7620,7 +7624,8 @@ def pivot( sort_columns Sort the transposed columns by name. Default is by order of discovery. separator - Used as separator/delimiter in generated column names in case of multiple value columns. + Used as separator/delimiter in generated column names in case of multiple + `values` columns. Returns ------- @@ -7632,49 +7637,84 @@ def pivot( Examples -------- + You can use `pivot` to reshape a dataframe from "long" to "wide" format. + + For example, suppose we have a dataframe of test scores achieved by some + students, where each row represents a distinct test. + >>> df = pl.DataFrame( ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], + ... "name": ["Cady", "Cady", "Karen", "Karen"], + ... "subject": ["maths", "physics", "maths", "physics"], + ... "test_1": [98, 99, 61, 58], + ... "test_2": [100, 100, 60, 60], ... } ...
) - >>> df.pivot("bar", index="foo", values="baz", aggregate_function="sum") + >>> df + shape: (4, 4) + ┌───────┬─────────┬────────┬────────┐ + │ name ┆ subject ┆ test_1 ┆ test_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪════════╪════════╡ + │ Cady ┆ maths ┆ 98 ┆ 100 │ + │ Cady ┆ physics ┆ 99 ┆ 100 │ + │ Karen ┆ maths ┆ 61 ┆ 60 │ + │ Karen ┆ physics ┆ 58 ┆ 60 │ + └───────┴─────────┴────────┴────────┘ + + Using `pivot`, we can reshape so we have one row per student, with different + subjects as columns, and their `test_1` scores as values: + + >>> df.pivot("subject", index="name", values="test_1") shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ + ┌───────┬───────┬─────────┐ + │ name ┆ maths ┆ physics │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════╪═══════╪═════════╡ + │ Cady ┆ 98 ┆ 99 │ + │ Karen ┆ 61 ┆ 58 │ + └───────┴───────┴─────────┘ - Pivot using selectors to determine the index/values/columns: + You can use selectors too - here we include all test scores in the pivoted table: >>> import polars.selectors as cs - >>> df.pivot( - ... cs.string(), - ... index=cs.string(), - ... values=cs.numeric(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... 
by=cs.string(), + >>> df.pivot("subject", values=cs.starts_with("test")) + shape: (2, 5) + ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐ + │ name ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════╪══════════════╪════════════════╪══════════════╪════════════════╡ + │ Cady ┆ 98 ┆ 99 ┆ 100 ┆ 100 │ + │ Karen ┆ 61 ┆ 58 ┆ 60 ┆ 60 │ + └───────┴──────────────┴────────────────┴──────────────┴────────────────┘ + + If you end up with multiple values per cell, you can specify how to aggregate + them with `aggregate_function`: + + >>> df = pl.DataFrame( + ... { + ... "ix": [1, 1, 2, 2, 1, 2], + ... "col": ["a", "a", "a", "a", "b", "b"], + ... "foo": [0, 1, 2, 2, 7, 1], + ... "bar": [0, 2, 0, 0, 9, 4], + ... } ... ) - shape: (4, 6) - ┌─────┬─────┬─────────────┬─────────────┬─────────────┬─────────────┐ - │ foo ┆ bar ┆ {"one","x"} ┆ {"one","y"} ┆ {"two","x"} ┆ {"two","y"} │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════════════╪═════════════╪═════════════╪═════════════╡ - │ one ┆ x ┆ 5 ┆ null ┆ null ┆ null │ - │ one ┆ y ┆ null ┆ 3 ┆ null ┆ null │ - │ two ┆ x ┆ null ┆ null ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ null ┆ null ┆ 3 │ - └─────┴─────┴─────────────┴─────────────┴─────────────┴─────────────┘ - - Run an expression as aggregation function + >>> df.pivot("col", index="ix", aggregate_function="sum") + shape: (2, 5) + ┌─────┬───────┬───────┬───────┬───────┐ + │ ix ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 1 ┆ 7 ┆ 2 ┆ 9 │ + │ 2 ┆ 4 ┆ 1 ┆ 0 ┆ 4 │ + └─────┴───────┴───────┴───────┴───────┘ + + You can also pass a custom aggregation function using + :meth:`polars.element`: >>> df = pl.DataFrame( ... 
{ @@ -7721,38 +7761,12 @@ def pivot( │ a ┆ 0.998347 ┆ null │ │ b ┆ 0.964028 ┆ 0.999954 │ └──────┴──────────┴──────────┘ - - Using a custom `separator` in generated column names: - - >>> df = pl.DataFrame( - ... { - ... "ix": [1, 1, 2, 2, 1, 2], - ... "col": ["a", "a", "a", "a", "b", "b"], - ... "foo": [0, 1, 2, 2, 7, 1], - ... "bar": [0, 2, 0, 0, 9, 4], - ... } - ... ) - >>> df.pivot( - ... "col", - ... index="ix", - ... values=["foo", "bar"], - ... aggregate_function="sum", - ... separator="/", - ... ) - shape: (2, 5) - ┌─────┬───────┬───────┬───────┬───────┐ - │ ix ┆ foo/a ┆ foo/b ┆ bar/a ┆ bar/b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 1 ┆ 7 ┆ 2 ┆ 9 │ - │ 2 ┆ 4 ┆ 1 ┆ 0 ┆ 4 │ - └─────┴───────┴───────┴───────┴───────┘ """ # noqa: W505 - index = _expand_selectors(self, index) on = _expand_selectors(self, on) if values is not None: values = _expand_selectors(self, values) + if index is not None: + index = _expand_selectors(self, index) if isinstance(aggregate_function, str): if aggregate_function == "first": diff --git a/py-polars/src/dataframe/general.rs b/py-polars/src/dataframe/general.rs index b01a0166661f..22199ebb3657 100644 --- a/py-polars/src/dataframe/general.rs +++ b/py-polars/src/dataframe/general.rs @@ -421,7 +421,7 @@ impl PyDataFrame { pub fn pivot_expr( &self, on: Vec, - index: Vec, + index: Option>, values: Option>, maintain_order: bool, sort_columns: bool, diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py index ca92aeb48434..dced58df5618 100644 --- a/py-polars/tests/unit/operations/test_pivot.py +++ b/py-polars/tests/unit/operations/test_pivot.py @@ -22,7 +22,7 @@ def test_pivot() -> None: "N": [1, 2, 2, 4, 2], } ) - result = df.pivot(index="foo", on="bar", values="N", aggregate_function=None) + result = df.pivot("bar", values="N", aggregate_function=None) expected = pl.DataFrame( [ @@ -45,7 +45,7 @@ def 
test_pivot_no_values() -> None: "N2": [1, 2, 2, 4, 2], } ) - result = df.pivot(index="foo", on="bar", values=None, aggregate_function=None) + result = df.pivot(on="bar", index="foo", aggregate_function=None) expected = pl.DataFrame( { "foo": ["A", "B", "C"], @@ -523,3 +523,11 @@ def test_pivot_string_17081() -> None: "5": [None, "8", None], "6": [None, None, "9"], } + + +def test_pivot_invalid() -> None: + with pytest.raises( + pl.exceptions.InvalidOperationError, + match="`index` and `values` cannot both be None in `pivot` operation", + ): + pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")