From fb126742bff075fc5274ce056c25c6cd6a426733 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 23 May 2024 13:38:32 +0400 Subject: [PATCH] feat: Allow designation of a custom name for the `value_counts` "count" column --- crates/polars-ops/src/series/ops/various.rs | 16 ++++++---- .../src/dsl/function_expr/dispatch.rs | 9 ++++-- .../polars-plan/src/dsl/function_expr/mod.rs | 14 +++++++-- .../src/dsl/function_expr/schema.rs | 9 ++++-- crates/polars-plan/src/dsl/mod.rs | 16 ++++++---- .../rust/user-guide/expressions/structs.rs | 4 +-- py-polars/polars/expr/expr.py | 27 +++++++++++++---- py-polars/polars/series/series.py | 30 +++++++++++-------- py-polars/src/expr/general.rs | 4 +-- py-polars/src/lazyframe/visitor/expr_nodes.rs | 1 + py-polars/src/series/mod.rs | 9 ++++-- .../unit/operations/test_value_counts.py | 19 +++++++++--- 12 files changed, 113 insertions(+), 45 deletions(-) diff --git a/crates/polars-ops/src/series/ops/various.rs b/crates/polars-ops/src/series/ops/various.rs index 5d67e38b2fcd..ecf5feb2f83c 100644 --- a/crates/polars-ops/src/series/ops/various.rs +++ b/crates/polars-ops/src/series/ops/various.rs @@ -11,21 +11,27 @@ use crate::series::ops::SeriesSealed; pub trait SeriesMethods: SeriesSealed { /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"` /// with dtype [`IdxType`] - fn value_counts(&self, sort: bool, parallel: bool) -> PolarsResult<DataFrame> { + fn value_counts( + &self, + sort: bool, + parallel: bool, + name: Option<String>, + ) -> PolarsResult<DataFrame> { + let name = name.unwrap_or("count".to_string()); let s = self.as_series(); polars_ensure!( - s.name() != "count", - Duplicate: "using `value_counts` on a column named 'count' would lead to duplicate column names" + s.name() != name, + Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate column names; change `name` to fix", name, ); // we need to sort here as well in case of `maintain_order` because duplicates behavior is 
undefined let groups = s.group_tuples(parallel, sort)?; let values = unsafe { s.agg_first(&groups) }; - let counts = groups.group_count().with_name("count"); + let counts = groups.group_count().with_name(name.as_str()); let cols = vec![values, counts.into_series()]; let df = unsafe { DataFrame::new_no_checks(cols) }; if sort { df.sort( - ["count"], + [name], SortMultipleOptions::default() .with_order_descending(true) .with_multithreaded(parallel), diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index dd7c6eba0f86..56541b9923bd 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -54,8 +54,13 @@ pub(super) fn replace_time_zone( } #[cfg(feature = "dtype-struct")] -pub(super) fn value_counts(s: &Series, sort: bool, parallel: bool) -> PolarsResult<Series> { - s.value_counts(sort, parallel) +pub(super) fn value_counts( + s: &Series, + sort: bool, + parallel: bool, + name: Option<String>, +) -> PolarsResult<Series> { + s.value_counts(sort, parallel, name) .map(|df| df.into_struct(s.name()).into_series()) } diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 5909779440a9..492d52e447a3 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -221,6 +221,7 @@ pub enum FunctionExpr { ValueCounts { sort: bool, parallel: bool, + name: Option<String>, }, #[cfg(feature = "unique_counts")] UniqueCounts, @@ -463,9 +464,14 @@ impl Hash for FunctionExpr { #[cfg(feature = "cum_agg")] CumMax { reverse } => reverse.hash(state), #[cfg(feature = "dtype-struct")] - ValueCounts { sort, parallel } => { + ValueCounts { + sort, + parallel, + name, + } => { sort.hash(state); parallel.hash(state); + name.hash(state); }, #[cfg(feature = "unique_counts")] UniqueCounts => {}, @@ -999,7 +1005,11 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> { #[cfg(feature = "cum_agg")] CumMax { 
reverse } => map!(cum::cum_max, reverse), #[cfg(feature = "dtype-struct")] - ValueCounts { sort, parallel } => map!(dispatch::value_counts, sort, parallel), + ValueCounts { + sort, + parallel, + name, + } => map!(dispatch::value_counts, sort, parallel, name.clone()), #[cfg(feature = "unique_counts")] UniqueCounts => map!(dispatch::unique_counts), Reverse => map!(dispatch::reverse), diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index dadfa5560c65..8106eefa1c2a 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -105,10 +105,15 @@ impl FunctionExpr { #[cfg(feature = "top_k")] TopKBy { .. } => mapper.with_same_dtype(), #[cfg(feature = "dtype-struct")] - ValueCounts { .. } => mapper.map_dtype(|dt| { + ValueCounts { + sort: _, + parallel: _, + name, + } => mapper.map_dtype(|dt| { + let name = name.clone().unwrap_or("count".to_string()); DataType::Struct(vec![ Field::new(fields[0].name().as_str(), dt.clone()), - Field::new("count", IDX_DTYPE), + Field::new(name.as_str(), IDX_DTYPE), ]) }), #[cfg(feature = "unique_counts")] diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 2492f887f6a1..4fa365051ed0 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1714,12 +1714,16 @@ impl Expr { #[cfg(feature = "dtype-struct")] /// Count all unique values and create a struct mapping value to count. /// (Note that it is better to turn parallel off in the aggregation context). 
- pub fn value_counts(self, sort: bool, parallel: bool) -> Self { - self.apply_private(FunctionExpr::ValueCounts { sort, parallel }) - .with_function_options(|mut opts| { - opts.pass_name_to_apply = true; - opts - }) + pub fn value_counts(self, sort: bool, parallel: bool, name: Option<String>) -> Self { + self.apply_private(FunctionExpr::ValueCounts { + sort, + parallel, + name, + }) + .with_function_options(|mut opts| { + opts.pass_name_to_apply = true; + opts + }) } #[cfg(feature = "unique_counts")] diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index 01c08eaf3d7f..9950073e5702 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -17,7 +17,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { let out = ratings .clone() .lazy() - .select([col("Theatre").value_counts(true, true)]) + .select([col("Theatre").value_counts(true, true, None)]) .collect()?; println!("{}", &out); // --8<-- [end:state_value_counts] @@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { let out = ratings .clone() .lazy() - .select([col("Theatre").value_counts(true, true)]) + .select([col("Theatre").value_counts(true, true, None)]) .unnest(["Theatre"]) .collect()?; println!("{}", &out); diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 4505a19ad7f6..c5b08c37e862 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -10909,7 +10909,9 @@ def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Self: return self._from_pyexpr(self._pyexpr.extend_constant(value, n)) @deprecate_renamed_parameter("multithreaded", "parallel", version="0.19.0") - def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self: + def value_counts( + self, *, sort: bool = False, parallel: bool = False, name: str = "count" + ) -> Self: """ Count the occurrences of unique values. 
@@ -10924,6 +10926,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self: .. note:: This option should likely not be enabled in a group by context, as the computation is already parallelized per group. + name + Give the resulting count field a specific name; defaults to "count". Returns ------- @@ -10948,9 +10952,10 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self: │ {"blue",3} │ └─────────────┘ - Sort the output by count. + Sort the output by (descending) count and customise the count field name. - >>> df.select(pl.col("color").value_counts(sort=True)) + >>> df = df.select(pl.col("color").value_counts(sort=True, name="n")) + >>> df shape: (3, 1) ┌─────────────┐ │ color │ @@ -10961,8 +10966,20 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self: │ {"red",2} │ │ {"green",1} │ └─────────────┘ - """ - return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel)) + + >>> df.unnest("color") + shape: (3, 2) + ┌───────┬─────┐ + │ color ┆ n │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴─────┘ + """ + return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel, name)) def unique_counts(self) -> Self: """ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index fe328d168b87..78dff485e956 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -2762,7 +2762,9 @@ def hist( else: return out.struct.unnest() - def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame: + def value_counts( + self, *, sort: bool = False, parallel: bool = False, name: str = "count" + ) -> DataFrame: """ Count the occurrences of unique values. @@ -2777,6 +2779,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra .. 
note:: This option should likely not be enabled in a group by context, as the computation is already parallelized per group. + name + Give the resulting count column a specific name; defaults to "count". Returns ------- @@ -2798,22 +2802,22 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra │ blue ┆ 3 │ └───────┴───────┘ - Sort the output by count. + Sort the output by count and customize the count column name. - >>> s.value_counts(sort=True) + >>> s.value_counts(sort=True, name="n") shape: (3, 2) - ┌───────┬───────┐ - │ color ┆ count │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪═══════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴───────┘ + ┌───────┬─────┐ + │ color ┆ n │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴─────┘ """ return pl.DataFrame._from_pydf( - self._s.value_counts(sort=sort, parallel=parallel) + self._s.value_counts(sort=sort, parallel=parallel, name=name) ) def unique_counts(self) -> Series: diff --git a/py-polars/src/expr/general.rs b/py-polars/src/expr/general.rs index 5808a30d9a18..1d2eec345472 100644 --- a/py-polars/src/expr/general.rs +++ b/py-polars/src/expr/general.rs @@ -250,8 +250,8 @@ impl PyExpr { fn len(&self) -> Self { self.inner.clone().len().into() } - fn value_counts(&self, sort: bool, parallel: bool) -> Self { - self.inner.clone().value_counts(sort, parallel).into() + fn value_counts(&self, sort: bool, parallel: bool, name: Option<String>) -> Self { + self.inner.clone().value_counts(sort, parallel, name).into() } fn unique_counts(&self) -> Self { self.inner.clone().unique_counts().into() diff --git a/py-polars/src/lazyframe/visitor/expr_nodes.rs b/py-polars/src/lazyframe/visitor/expr_nodes.rs index 82b0e5281362..cc3ba98490de 100644 --- a/py-polars/src/lazyframe/visitor/expr_nodes.rs +++ b/py-polars/src/lazyframe/visitor/expr_nodes.rs @@ -957,6 +957,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> { 
FunctionExpr::ValueCounts { sort: _, parallel: _, + name: _, } => return Err(PyNotImplementedError::new_err("value counts")), FunctionExpr::UniqueCounts => { return Err(PyNotImplementedError::new_err("unique counts")) diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index 0ceb58eeb9c0..080a50c2af77 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -732,10 +732,15 @@ impl PySeries { self.series.tail(Some(n)).into() } - fn value_counts(&self, sort: bool, parallel: bool) -> PyResult<PyDataFrame> { + fn value_counts( + &self, + sort: bool, + parallel: bool, + name: Option<String>, + ) -> PyResult<PyDataFrame> { let out = self .series - .value_counts(sort, parallel) + .value_counts(sort, parallel, name) .map_err(PyPolarsErr::from)?; Ok(out.into()) } diff --git a/py-polars/tests/unit/operations/test_value_counts.py b/py-polars/tests/unit/operations/test_value_counts.py index b38992b5146a..0c8c82ea73d5 100644 --- a/py-polars/tests/unit/operations/test_value_counts.py +++ b/py-polars/tests/unit/operations/test_value_counts.py @@ -51,13 +51,20 @@ def test_value_counts_expr() -> None: def test_value_counts_duplicate_name() -> None: - s = pl.Series("count", [1]) + s = pl.Series("count", [1, 0, 1]) - with pytest.raises(pl.DuplicateError, match="count"): + # default name is 'count' ... + with pytest.raises( + pl.DuplicateError, + match="duplicate column names; change `name` to fix", + ): s.value_counts() - def test_count() -> None: - assert pl.Series([None, 1, None, 2, 3]).count() == 3 + # ... but can customize that + assert_frame_equal( + pl.DataFrame({"count": [1, 0], "n": [2, 1]}, schema_overrides={"n": pl.UInt32}), + s.value_counts(name="n", sort=True), + ) df = pl.DataFrame({"a": [None, 1, None, 2, 3]}) assert df.select(pl.col("a").count()).item() == 3 @@ -66,3 +73,7 @@ def test_count() -> None: "literal": [1], "a": [3], } + + +def test_count() -> None: + assert pl.Series([None, 1, None, 2, 3]).count() == 3