Skip to content

Commit

Permalink
feat: Allow designation of a custom name for the value_counts "count" column
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed May 23, 2024
1 parent 30a5534 commit fb12674
Show file tree
Hide file tree
Showing 12 changed files with 113 additions and 45 deletions.
16 changes: 11 additions & 5 deletions crates/polars-ops/src/series/ops/various.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,27 @@ use crate::series::ops::SeriesSealed;
pub trait SeriesMethods: SeriesSealed {
/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a count column
/// (named `"count"` by default, or `name` if one is given) with dtype [`IdxType`]
fn value_counts(&self, sort: bool, parallel: bool) -> PolarsResult<DataFrame> {
fn value_counts(
&self,
sort: bool,
parallel: bool,
name: Option<String>,
) -> PolarsResult<DataFrame> {
let name = name.unwrap_or("count".to_string());
let s = self.as_series();
polars_ensure!(
s.name() != "count",
Duplicate: "using `value_counts` on a column named 'count' would lead to duplicate column names"
s.name() != name,
Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate column names; change `name` to fix", name,
);
// we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
let groups = s.group_tuples(parallel, sort)?;
let values = unsafe { s.agg_first(&groups) };
let counts = groups.group_count().with_name("count");
let counts = groups.group_count().with_name(name.as_str());
let cols = vec![values, counts.into_series()];
let df = unsafe { DataFrame::new_no_checks(cols) };
if sort {
df.sort(
["count"],
[name],
SortMultipleOptions::default()
.with_order_descending(true)
.with_multithreaded(parallel),
Expand Down
9 changes: 7 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,13 @@ pub(super) fn replace_time_zone(
}

#[cfg(feature = "dtype-struct")]
pub(super) fn value_counts(s: &Series, sort: bool, parallel: bool) -> PolarsResult<Series> {
s.value_counts(sort, parallel)
pub(super) fn value_counts(
    s: &Series,
    sort: bool,
    parallel: bool,
    name: Option<String>,
) -> PolarsResult<Series> {
    // Compute the counts DataFrame, then collapse its columns into a single
    // struct series that carries the original series' name.
    let counts_df = s.value_counts(sort, parallel, name)?;
    Ok(counts_df.into_struct(s.name()).into_series())
}

Expand Down
14 changes: 12 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ pub enum FunctionExpr {
ValueCounts {
sort: bool,
parallel: bool,
name: Option<String>,
},
#[cfg(feature = "unique_counts")]
UniqueCounts,
Expand Down Expand Up @@ -463,9 +464,14 @@ impl Hash for FunctionExpr {
#[cfg(feature = "cum_agg")]
CumMax { reverse } => reverse.hash(state),
#[cfg(feature = "dtype-struct")]
ValueCounts { sort, parallel } => {
ValueCounts {
sort,
parallel,
name,
} => {
sort.hash(state);
parallel.hash(state);
name.hash(state);
},
#[cfg(feature = "unique_counts")]
UniqueCounts => {},
Expand Down Expand Up @@ -999,7 +1005,11 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
#[cfg(feature = "cum_agg")]
CumMax { reverse } => map!(cum::cum_max, reverse),
#[cfg(feature = "dtype-struct")]
ValueCounts { sort, parallel } => map!(dispatch::value_counts, sort, parallel),
ValueCounts {
sort,
parallel,
name,
} => map!(dispatch::value_counts, sort, parallel, name.clone()),
#[cfg(feature = "unique_counts")]
UniqueCounts => map!(dispatch::unique_counts),
Reverse => map!(dispatch::reverse),
Expand Down
9 changes: 7 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,15 @@ impl FunctionExpr {
#[cfg(feature = "top_k")]
TopKBy { .. } => mapper.with_same_dtype(),
#[cfg(feature = "dtype-struct")]
ValueCounts { .. } => mapper.map_dtype(|dt| {
ValueCounts {
sort: _,
parallel: _,
name,
} => mapper.map_dtype(|dt| {
let name = name.clone().unwrap_or("count".to_string());
DataType::Struct(vec![
Field::new(fields[0].name().as_str(), dt.clone()),
Field::new("count", IDX_DTYPE),
Field::new(name.as_str(), IDX_DTYPE),
])
}),
#[cfg(feature = "unique_counts")]
Expand Down
16 changes: 10 additions & 6 deletions crates/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1714,12 +1714,16 @@ impl Expr {
#[cfg(feature = "dtype-struct")]
/// Count all unique values and create a struct mapping value to count.
/// (Note that it is better to turn parallel off in the aggregation context).
pub fn value_counts(self, sort: bool, parallel: bool) -> Self {
self.apply_private(FunctionExpr::ValueCounts { sort, parallel })
.with_function_options(|mut opts| {
opts.pass_name_to_apply = true;
opts
})
pub fn value_counts(self, sort: bool, parallel: bool, name: Option<String>) -> Self {
    // Build the function expression first, then apply it with the option
    // that forwards the input column's name to the apply function.
    let function = FunctionExpr::ValueCounts {
        sort,
        parallel,
        name,
    };
    self.apply_private(function)
        .with_function_options(|mut opts| {
            opts.pass_name_to_apply = true;
            opts
        })
}

#[cfg(feature = "unique_counts")]
Expand Down
4 changes: 2 additions & 2 deletions docs/src/rust/user-guide/expressions/structs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true)])
.select([col("Theatre").value_counts(true, true, None)])
.collect()?;
println!("{}", &out);
// --8<-- [end:state_value_counts]
Expand All @@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true)])
.select([col("Theatre").value_counts(true, true, None)])
.unnest(["Theatre"])
.collect()?;
println!("{}", &out);
Expand Down
27 changes: 22 additions & 5 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10909,7 +10909,9 @@ def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Self:
return self._from_pyexpr(self._pyexpr.extend_constant(value, n))

@deprecate_renamed_parameter("multithreaded", "parallel", version="0.19.0")
def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
) -> Self:
"""
Count the occurrences of unique values.
Expand All @@ -10924,6 +10926,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
.. note::
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count field a specific name; defaults to "count".
Returns
-------
Expand All @@ -10948,9 +10952,10 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
│ {"blue",3} │
└─────────────┘
Sort the output by count.
Sort the output by (descending) count and customise the count field name.
>>> df.select(pl.col("color").value_counts(sort=True))
>>> df = df.select(pl.col("color").value_counts(sort=True, name="n"))
>>> df
shape: (3, 1)
┌─────────────┐
│ color │
Expand All @@ -10961,8 +10966,20 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
│ {"red",2} │
│ {"green",1} │
└─────────────┘
"""
return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel))
>>> df.unnest("color")
shape: (3, 2)
┌───────┬─────┐
│ color ┆ n │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪═════╡
│ blue ┆ 3 │
│ red ┆ 2 │
│ green ┆ 1 │
└───────┴─────┘
"""
return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel, name))

def unique_counts(self) -> Self:
"""
Expand Down
30 changes: 17 additions & 13 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2762,7 +2762,9 @@ def hist(
else:
return out.struct.unnest()

def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame:
def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
) -> DataFrame:
"""
Count the occurrences of unique values.
Expand All @@ -2777,6 +2779,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
.. note::
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count column a specific name; defaults to "count".
Returns
-------
Expand All @@ -2798,22 +2802,22 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
│ blue ┆ 3 │
└───────┴───────┘
Sort the output by count.
Sort the output by count and customize the count column name.
>>> s.value_counts(sort=True)
>>> s.value_counts(sort=True, name="n")
shape: (3, 2)
┌───────┬───────
│ color ┆ count
│ --- ┆ ---
│ str ┆ u32
╞═══════╪═══════
│ blue ┆ 3
│ red ┆ 2
│ green ┆ 1
└───────┴───────
┌───────┬─────┐
│ color ┆ n
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪═════╡
│ blue ┆ 3 │
│ red ┆ 2 │
│ green ┆ 1 │
└───────┴─────┘
"""
return pl.DataFrame._from_pydf(
self._s.value_counts(sort=sort, parallel=parallel)
self._s.value_counts(sort=sort, parallel=parallel, name=name)
)

def unique_counts(self) -> Series:
Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ impl PyExpr {
fn len(&self) -> Self {
self.inner.clone().len().into()
}
fn value_counts(&self, sort: bool, parallel: bool) -> Self {
self.inner.clone().value_counts(sort, parallel).into()
fn value_counts(&self, sort: bool, parallel: bool, name: Option<String>) -> Self {
    // Delegate to the underlying polars expression and wrap the result
    // back up as a PyExpr.
    let expr = self.inner.clone().value_counts(sort, parallel, name);
    expr.into()
}
fn unique_counts(&self) -> Self {
self.inner.clone().unique_counts().into()
Expand Down
1 change: 1 addition & 0 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
FunctionExpr::ValueCounts {
sort: _,
parallel: _,
name: _,
} => return Err(PyNotImplementedError::new_err("value counts")),
FunctionExpr::UniqueCounts => {
return Err(PyNotImplementedError::new_err("unique counts"))
Expand Down
9 changes: 7 additions & 2 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,10 +732,15 @@ impl PySeries {
self.series.tail(Some(n)).into()
}

fn value_counts(&self, sort: bool, parallel: bool) -> PyResult<PyDataFrame> {
fn value_counts(
&self,
sort: bool,
parallel: bool,
name: Option<String>,
) -> PyResult<PyDataFrame> {
let out = self
.series
.value_counts(sort, parallel)
.value_counts(sort, parallel, name)
.map_err(PyPolarsErr::from)?;
Ok(out.into())
}
Expand Down
19 changes: 15 additions & 4 deletions py-polars/tests/unit/operations/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,20 @@ def test_value_counts_expr() -> None:


def test_value_counts_duplicate_name() -> None:
s = pl.Series("count", [1])
s = pl.Series("count", [1, 0, 1])

with pytest.raises(pl.DuplicateError, match="count"):
# default name is 'count' ...
with pytest.raises(
pl.DuplicateError,
match="duplicate column names; change `name` to fix",
):
s.value_counts()

def test_count() -> None:
assert pl.Series([None, 1, None, 2, 3]).count() == 3
# ... but can customize that
assert_frame_equal(
pl.DataFrame({"count": [1, 0], "n": [2, 1]}, schema_overrides={"n": pl.UInt32}),
s.value_counts(name="n", sort=True),
)

df = pl.DataFrame({"a": [None, 1, None, 2, 3]})
assert df.select(pl.col("a").count()).item() == 3
Expand All @@ -66,3 +73,7 @@ def test_count() -> None:
"literal": [1],
"a": [3],
}


def test_count() -> None:
    # Series.count ignores nulls: 3 of the 5 values here are non-null.
    assert pl.Series([None, 1, None, 2, 3]).count() == 3

0 comments on commit fb12674

Please sign in to comment.