From 3117ab154e82cc1be60e61ac2e2cee9c00873d42 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli <marcogorelli@protonmail.com>
Date: Sat, 22 Jun 2024 16:16:08 +0100
Subject: [PATCH] feat: Update `DataFrame.pivot` to allow `index=None` when
 `values` is set (#17126)

---
 crates/polars-lazy/src/frame/pivot.rs         |   4 +-
 crates/polars-ops/src/frame/pivot/mod.rs      |  74 +++++----
 crates/polars/tests/it/core/pivot.rs          |  24 +--
 docs/releases/upgrade/1.md                    |   9 +-
 .../rust/user-guide/transformations/pivot.rs  |   4 +-
 py-polars/polars/dataframe/frame.py           | 148 ++++++++++--------
 py-polars/src/dataframe/general.rs            |   2 +-
 py-polars/tests/unit/operations/test_pivot.py |  12 +-
 8 files changed, 161 insertions(+), 116 deletions(-)
diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs
index a3b2acab4bd8..759981c52f0e 100644
--- a/crates/polars-lazy/src/frame/pivot.rs
+++ b/crates/polars-lazy/src/frame/pivot.rs
@@ -33,7 +33,7 @@ impl PhysicalAggExpr for PivotExpr {
 pub fn pivot<I0, I1, I2, S0, S1, S2>(
     df: &DataFrame,
     on: I0,
-    index: I1,
+    index: Option<I1>,
     values: Option<I2>,
     sort_columns: bool,
     agg_expr: Option<Expr>,
@@ -59,7 +59,7 @@ where
 pub fn pivot_stable<I0, I1, I2, S0, S1, S2>(
     df: &DataFrame,
     on: I0,
-    index: I1,
+    index: Option<I1>,
     values: Option<I2>,
     sort_columns: bool,
     agg_expr: Option<Expr>,
diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs
index 11a2d0b64248..94c6d33100c8 100644
--- a/crates/polars-ops/src/frame/pivot/mod.rs
+++ b/crates/polars-ops/src/frame/pivot/mod.rs
@@ -85,7 +85,7 @@ fn restore_logical_type(s: &Series, logical_type: &DataType) -> Series {
 pub fn pivot<I0, I1, I2, S0, S1, S2>(
     pivot_df: &DataFrame,
     on: I0,
-    index: I1,
+    index: Option<I1>,
     values: Option<I2>,
     sort_columns: bool,
     agg_fn: Option<PivotAgg>,
@@ -99,15 +99,11 @@ where
     S1: AsRef<str>,
     S2: AsRef<str>,
 {
-    let index = index
-        .into_iter()
-        .map(|s| s.as_ref().to_string())
-        .collect::<Vec<_>>();
     let on = on
         .into_iter()
         .map(|s| s.as_ref().to_string())
         .collect::<Vec<_>>();
-    let values = get_values_columns(pivot_df, &index, &on, values);
+    let (index, values) = assign_remaining_columns(pivot_df, &on, index, values)?;
     pivot_impl(
         pivot_df,
         &on,
@@ -128,7 +124,7 @@ where
 pub fn pivot_stable<I0, I1, I2, S0, S1, S2>(
     pivot_df: &DataFrame,
     on: I0,
-    index: I1,
+    index: Option<I1>,
     values: Option<I2>,
     sort_columns: bool,
     agg_fn: Option<PivotAgg>,
@@ -142,15 +138,11 @@ where
     S1: AsRef<str>,
     S2: AsRef<str>,
 {
-    let index = index
-        .into_iter()
-        .map(|s| s.as_ref().to_string())
-        .collect::<Vec<_>>();
     let on = on
         .into_iter()
         .map(|s| s.as_ref().to_string())
         .collect::<Vec<_>>();
-    let values = get_values_columns(pivot_df, &index, &on, values);
+    let (index, values) = assign_remaining_columns(pivot_df, &on, index, values)?;
     pivot_impl(
         pivot_df,
         &on,
@@ -163,28 +155,52 @@ where
     )
 }
 
-/// Determine `values` columns, which is optional in `pivot` calls.
+/// Ensure both `index` and `values` are populated with `Vec<String>`.
 ///
-/// If not specified (i.e. is `None`), use all remaining columns in the
-/// `DataFrame` after `index` and `columns` have been excluded.
-fn get_values_columns<I, S>(
+/// - If `index` is `None`, assign all columns not in `on` or `values` to it.
+/// - If `values` is `None`, assign all columns not in `on` or `index` to it.
+/// - If both `index` and `values` are `None`, an `InvalidOperation` error is returned.
+fn assign_remaining_columns<I1, I2, S1, S2>(
     df: &DataFrame,
-    index: &[String],
     on: &[String],
-    values: Option<I>,
-) -> Vec<String>
+    index: Option<I1>,
+    values: Option<I2>,
+) -> PolarsResult<(Vec<String>, Vec<String>)>
 where
-    I: IntoIterator<Item = S>,
-    S: AsRef<str>,
+    I1: IntoIterator<Item = S1>,
+    I2: IntoIterator<Item = S2>,
+    S1: AsRef<str>,
+    S2: AsRef<str>,
 {
-    match values {
-        Some(v) => v.into_iter().map(|s| s.as_ref().to_string()).collect(),
-        None => df
-            .get_column_names()
-            .into_iter()
-            .map(|c| c.to_string())
-            .filter(|c| !(index.contains(c) | on.contains(c)))
-            .collect(),
+    match (index, values) {
+        (Some(index), Some(values)) => {
+            let index = index.into_iter().map(|s| s.as_ref().to_string()).collect();
+            let values = values.into_iter().map(|s| s.as_ref().to_string()).collect();
+            Ok((index, values))
+        },
+        (Some(index), None) => {
+            let index: Vec<String> = index.into_iter().map(|s| s.as_ref().to_string()).collect();
+            let values = df
+                .get_column_names()
+                .into_iter()
+                .map(|s| s.to_string())
+                .filter(|c| !(index.contains(c) | on.contains(c)))
+                .collect();
+            Ok((index, values))
+        },
+        (None, Some(values)) => {
+            let values: Vec<String> = values.into_iter().map(|s| s.as_ref().to_string()).collect();
+            let index = df
+                .get_column_names()
+                .into_iter()
+                .map(|s| s.to_string())
+                .filter(|c| !(values.contains(c) | on.contains(c)))
+                .collect();
+            Ok((index, values))
+        },
+        (None, None) => {
+            polars_bail!(InvalidOperation: "`index` and `values` cannot both be None in `pivot` operation")
+        },
     }
 }
 
diff --git a/crates/polars/tests/it/core/pivot.rs b/crates/polars/tests/it/core/pivot.rs
index 144c3291a54e..e6e507be3163 100644
--- a/crates/polars/tests/it/core/pivot.rs
+++ b/crates/polars/tests/it/core/pivot.rs
@@ -16,7 +16,7 @@ fn test_pivot_date_() -> PolarsResult<()> {
     let out = pivot(
         &df,
         ["values1"],
-        ["index"],
+        Some(["index"]),
         Some(["values2"]),
         true,
         Some(PivotAgg::Count),
@@ -34,7 +34,7 @@ fn test_pivot_date_() -> PolarsResult<()> {
     let mut out = pivot_stable(
         &df,
         ["values2"],
-        ["index"],
+        Some(["index"]),
         Some(["values1"]),
         true,
         Some(PivotAgg::First),
@@ -64,7 +64,7 @@ fn test_pivot_old() {
     let pvt = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Sum),
@@ -79,7 +79,7 @@ fn test_pivot_old() {
     let pvt = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Min),
@@ -93,7 +93,7 @@ fn test_pivot_old() {
     let pvt = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Max),
@@ -107,7 +107,7 @@ fn test_pivot_old() {
     let pvt = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Mean),
@@ -121,7 +121,7 @@ fn test_pivot_old() {
     let pvt = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Count),
@@ -149,7 +149,7 @@ fn test_pivot_categorical() -> PolarsResult<()> {
     let out = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         true,
         Some(PivotAgg::Count),
@@ -174,7 +174,7 @@ fn test_pivot_new() -> PolarsResult<()> {
     let out = (pivot_stable(
         &df,
         ["cols1"],
-        ["index1", "index2"],
+        Some(["index1", "index2"]),
         Some(["values1"]),
         true,
         Some(PivotAgg::Sum),
@@ -191,7 +191,7 @@ fn test_pivot_new() -> PolarsResult<()> {
     let out = pivot_stable(
         &df,
         ["cols1", "cols2"],
-        ["index1", "index2"],
+        Some(["index1", "index2"]),
         Some(["values1"]),
         true,
         Some(PivotAgg::Sum),
@@ -222,7 +222,7 @@ fn test_pivot_2() -> PolarsResult<()> {
     let out = pivot_stable(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::First),
@@ -255,7 +255,7 @@ fn test_pivot_datetime() -> PolarsResult<()> {
     let out = pivot(
         &df,
         ["columns"],
-        ["index"],
+        Some(["index"]),
         Some(["values"]),
         false,
         Some(PivotAgg::Sum),
diff --git a/docs/releases/upgrade/1.md b/docs/releases/upgrade/1.md
index 0e5664e64e4e..3c0a32176f3d 100644
--- a/docs/releases/upgrade/1.md
+++ b/docs/releases/upgrade/1.md
@@ -393,7 +393,7 @@ After:
 ...         "test_2": [100, 100, 60, 60],
 ...     }
 ... )
->>> df.pivot(index='name', on='subject', values=['test_1', 'test_2'])
+>>> df.pivot('subject', index='name')
 ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐
 │ name  ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │
 │ ---   ┆ ---          ┆ ---            ┆ ---          ┆ ---            │
@@ -404,6 +404,13 @@ After:
 └───────┴──────────────┴────────────────┴──────────────┴────────────────┘
 ```
 
+Note that the function signature has also changed:
+
+- `columns` has been renamed to `on`, and is now the first positional argument.
+- `index` and `values` are both optional, but at least one must be specified.
+  If `index` is omitted, it uses all columns not in `on` and `values`; if
+  `values` is omitted, it uses all columns not in `on` and `index`.
+
 ### Support Decimal types by default when converting from Arrow
 
 Update conversion from Arrow to always convert Decimals into Polars Decimals, rather than cast to Float64.
diff --git a/docs/src/rust/user-guide/transformations/pivot.rs b/docs/src/rust/user-guide/transformations/pivot.rs
index 804ead13f056..5072ed82d52c 100644
--- a/docs/src/rust/user-guide/transformations/pivot.rs
+++ b/docs/src/rust/user-guide/transformations/pivot.rs
@@ -14,7 +14,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [end:df]
 
     // --8<-- [start:eager]
-    let out = pivot(&df, ["foo"], ["bar"], Some(["N"]), false, None, None)?;
+    let out = pivot(&df, ["foo"], Some(["bar"]), Some(["N"]), false, None, None)?;
     println!("{}", &out);
     // --8<-- [end:eager]
 
@@ -23,7 +23,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let q2 = pivot(
         &q.collect()?,
         ["foo"],
-        ["bar"],
+        Some(["bar"]),
         Some(["N"]),
         false,
         None,
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 1ad0fa9ecbf5..cb0e91f8051e 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -7586,8 +7586,8 @@ def pivot(
         self,
         on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
         *,
-        index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
-        values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None,
+        index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
+        values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
         aggregate_function: PivotAgg | Expr | None = None,
         maintain_order: bool = True,
         sort_columns: bool = False,
@@ -7605,9 +7605,13 @@ def pivot(
             Name of the column(s) whose values will be used as the header of the output
             DataFrame.
         index
-            One or multiple keys to group by.
+            One or multiple keys to group by. If None, all remaining columns not specified
+            in `on` and `values` will be used. At least one of `index` and `values` must
+            be specified.
         values
-            Column values to aggregate. If None, all remaining columns will be used.
+            Column values to aggregate. If None, all remaining columns not specified
+            in `on` and `index` will be used. At least one of `index` and `values` must
+            be specified.
         aggregate_function
             Choose from:
 
@@ -7620,7 +7624,8 @@ def pivot(
         sort_columns
             Sort the transposed columns by name. Default is by order of discovery.
         separator
-            Used as separator/delimiter in generated column names in case of multiple value columns.
+            Used as separator/delimiter in generated column names in case of multiple
+            `values` columns.
 
         Returns
         -------
@@ -7632,49 +7637,84 @@ def pivot(
 
         Examples
         --------
+        You can use `pivot` to reshape a dataframe from "long" to "wide" format.
+
+        For example, suppose we have a dataframe of test scores achieved by some
+        students, where each row represents a distinct test.
+
         >>> df = pl.DataFrame(
         ...     {
-        ...         "foo": ["one", "one", "two", "two", "one", "two"],
-        ...         "bar": ["y", "y", "y", "x", "x", "x"],
-        ...         "baz": [1, 2, 3, 4, 5, 6],
+        ...         "name": ["Cady", "Cady", "Karen", "Karen"],
+        ...         "subject": ["maths", "physics", "maths", "physics"],
+        ...         "test_1": [98, 99, 61, 58],
+        ...         "test_2": [100, 100, 60, 60],
         ...     }
         ... )
-        >>> df.pivot("bar", index="foo", values="baz", aggregate_function="sum")
+        >>> df
+        shape: (4, 4)
+        ┌───────┬─────────┬────────┬────────┐
+        │ name  ┆ subject ┆ test_1 ┆ test_2 │
+        │ ---   ┆ ---     ┆ ---    ┆ ---    │
+        │ str   ┆ str     ┆ i64    ┆ i64    │
+        ╞═══════╪═════════╪════════╪════════╡
+        │ Cady  ┆ maths   ┆ 98     ┆ 100    │
+        │ Cady  ┆ physics ┆ 99     ┆ 100    │
+        │ Karen ┆ maths   ┆ 61     ┆ 60     │
+        │ Karen ┆ physics ┆ 58     ┆ 60     │
+        └───────┴─────────┴────────┴────────┘
+
+        Using `pivot`, we can reshape so we have one row per student, with different
+        subjects as columns, and their `test_1` scores as values:
+
+        >>> df.pivot("subject", index="name", values="test_1")
         shape: (2, 3)
-        ┌─────┬─────┬─────┐
-        │ foo ┆ y   ┆ x   │
-        │ --- ┆ --- ┆ --- │
-        │ str ┆ i64 ┆ i64 │
-        ╞═════╪═════╪═════╡
-        │ one ┆ 3   ┆ 5   │
-        │ two ┆ 3   ┆ 10  │
-        └─────┴─────┴─────┘
+        ┌───────┬───────┬─────────┐
+        │ name  ┆ maths ┆ physics │
+        │ ---   ┆ ---   ┆ ---     │
+        │ str   ┆ i64   ┆ i64     │
+        ╞═══════╪═══════╪═════════╡
+        │ Cady  ┆ 98    ┆ 99      │
+        │ Karen ┆ 61    ┆ 58      │
+        └───────┴───────┴─────────┘
 
-        Pivot using selectors to determine the index/values/columns:
+        You can use selectors too - here we include all test scores in the pivoted table:
 
         >>> import polars.selectors as cs
-        >>> df.pivot(
-        ...     cs.string(),
-        ...     index=cs.string(),
-        ...     values=cs.numeric(),
-        ...     aggregate_function="sum",
-        ...     sort_columns=True,
-        ... ).sort(
-        ...     by=cs.string(),
+        >>> df.pivot("subject", values=cs.starts_with("test"))
+        shape: (2, 5)
+        ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐
+        │ name  ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │
+        │ ---   ┆ ---          ┆ ---            ┆ ---          ┆ ---            │
+        │ str   ┆ i64          ┆ i64            ┆ i64          ┆ i64            │
+        ╞═══════╪══════════════╪════════════════╪══════════════╪════════════════╡
+        │ Cady  ┆ 98           ┆ 99             ┆ 100          ┆ 100            │
+        │ Karen ┆ 61           ┆ 58             ┆ 60           ┆ 60             │
+        └───────┴──────────────┴────────────────┴──────────────┴────────────────┘
+
+        If you end up with multiple values per cell, you can specify how to aggregate
+        them with `aggregate_function`:
+
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "ix": [1, 1, 2, 2, 1, 2],
+        ...         "col": ["a", "a", "a", "a", "b", "b"],
+        ...         "foo": [0, 1, 2, 2, 7, 1],
+        ...         "bar": [0, 2, 0, 0, 9, 4],
+        ...     }
         ... )
-        shape: (4, 6)
-        ┌─────┬─────┬─────────────┬─────────────┬─────────────┬─────────────┐
-        │ foo ┆ bar ┆ {"one","x"} ┆ {"one","y"} ┆ {"two","x"} ┆ {"two","y"} │
-        │ --- ┆ --- ┆ ---         ┆ ---         ┆ ---         ┆ ---         │
-        │ str ┆ str ┆ i64         ┆ i64         ┆ i64         ┆ i64         │
-        ╞═════╪═════╪═════════════╪═════════════╪═════════════╪═════════════╡
-        │ one ┆ x   ┆ 5           ┆ null        ┆ null        ┆ null        │
-        │ one ┆ y   ┆ null        ┆ 3           ┆ null        ┆ null        │
-        │ two ┆ x   ┆ null        ┆ null        ┆ 10          ┆ null        │
-        │ two ┆ y   ┆ null        ┆ null        ┆ null        ┆ 3           │
-        └─────┴─────┴─────────────┴─────────────┴─────────────┴─────────────┘
-
-        Run an expression as aggregation function
+        >>> df.pivot("col", index="ix", aggregate_function="sum")
+        shape: (2, 5)
+        ┌─────┬───────┬───────┬───────┬───────┐
+        │ ix  ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
+        │ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
+        │ i64 ┆ i64   ┆ i64   ┆ i64   ┆ i64   │
+        ╞═════╪═══════╪═══════╪═══════╪═══════╡
+        │ 1   ┆ 1     ┆ 7     ┆ 2     ┆ 9     │
+        │ 2   ┆ 4     ┆ 1     ┆ 0     ┆ 4     │
+        └─────┴───────┴───────┴───────┴───────┘
+
+        You can also pass a custom aggregation function using
+        :meth:`polars.element`:
 
         >>> df = pl.DataFrame(
         ...     {
@@ -7721,38 +7761,12 @@ def pivot(
         │ a    ┆ 0.998347 ┆ null     │
         │ b    ┆ 0.964028 ┆ 0.999954 │
         └──────┴──────────┴──────────┘
-
-        Using a custom `separator` in generated column names:
-
-        >>> df = pl.DataFrame(
-        ...     {
-        ...         "ix": [1, 1, 2, 2, 1, 2],
-        ...         "col": ["a", "a", "a", "a", "b", "b"],
-        ...         "foo": [0, 1, 2, 2, 7, 1],
-        ...         "bar": [0, 2, 0, 0, 9, 4],
-        ...     }
-        ... )
-        >>> df.pivot(
-        ...     "col",
-        ...     index="ix",
-        ...     values=["foo", "bar"],
-        ...     aggregate_function="sum",
-        ...     separator="/",
-        ... )
-        shape: (2, 5)
-        ┌─────┬───────┬───────┬───────┬───────┐
-        │ ix  ┆ foo/a ┆ foo/b ┆ bar/a ┆ bar/b │
-        │ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
-        │ i64 ┆ i64   ┆ i64   ┆ i64   ┆ i64   │
-        ╞═════╪═══════╪═══════╪═══════╪═══════╡
-        │ 1   ┆ 1     ┆ 7     ┆ 2     ┆ 9     │
-        │ 2   ┆ 4     ┆ 1     ┆ 0     ┆ 4     │
-        └─────┴───────┴───────┴───────┴───────┘
         """  # noqa: W505
-        index = _expand_selectors(self, index)
         on = _expand_selectors(self, on)
         if values is not None:
             values = _expand_selectors(self, values)
+        if index is not None:
+            index = _expand_selectors(self, index)
 
         if isinstance(aggregate_function, str):
             if aggregate_function == "first":
diff --git a/py-polars/src/dataframe/general.rs b/py-polars/src/dataframe/general.rs
index b01a0166661f..22199ebb3657 100644
--- a/py-polars/src/dataframe/general.rs
+++ b/py-polars/src/dataframe/general.rs
@@ -421,7 +421,7 @@ impl PyDataFrame {
     pub fn pivot_expr(
         &self,
         on: Vec<String>,
-        index: Vec<String>,
+        index: Option<Vec<String>>,
         values: Option<Vec<String>>,
         maintain_order: bool,
         sort_columns: bool,
diff --git a/py-polars/tests/unit/operations/test_pivot.py b/py-polars/tests/unit/operations/test_pivot.py
index ca92aeb48434..dced58df5618 100644
--- a/py-polars/tests/unit/operations/test_pivot.py
+++ b/py-polars/tests/unit/operations/test_pivot.py
@@ -22,7 +22,7 @@ def test_pivot() -> None:
             "N": [1, 2, 2, 4, 2],
         }
     )
-    result = df.pivot(index="foo", on="bar", values="N", aggregate_function=None)
+    result = df.pivot("bar", values="N", aggregate_function=None)
 
     expected = pl.DataFrame(
         [
@@ -45,7 +45,7 @@ def test_pivot_no_values() -> None:
             "N2": [1, 2, 2, 4, 2],
         }
     )
-    result = df.pivot(index="foo", on="bar", values=None, aggregate_function=None)
+    result = df.pivot(on="bar", index="foo", aggregate_function=None)
     expected = pl.DataFrame(
         {
             "foo": ["A", "B", "C"],
@@ -523,3 +523,11 @@ def test_pivot_string_17081() -> None:
         "5": [None, "8", None],
         "6": [None, None, "9"],
     }
+
+
+def test_pivot_invalid() -> None:
+    with pytest.raises(
+        pl.exceptions.InvalidOperationError,
+        match="`index` and `values` cannot both be None in `pivot` operation",
+    ):
+        pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")