From a1ff7ee8ba14d565cf39ac73aaa08e41d5b1c406 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 25 May 2024 08:55:35 +0200 Subject: [PATCH] perf: improved numeric fill_(forward/backward) (#16475) --- crates/polars-arrow/src/legacy/utils.rs | 30 ++++---- crates/polars-arrow/src/trusted_len.rs | 2 +- .../src/chunked_array/ops/fill_null.rs | 70 ++++++++++++++++++- py-polars/tests/unit/dataframe/test_df.py | 18 ++++- 4 files changed, 103 insertions(+), 17 deletions(-) diff --git a/crates/polars-arrow/src/legacy/utils.rs b/crates/polars-arrow/src/legacy/utils.rs index 5670fed471ce..a9a6b7d0ed78 100644 --- a/crates/polars-arrow/src/legacy/utils.rs +++ b/crates/polars-arrow/src/legacy/utils.rs @@ -113,21 +113,27 @@ impl FromTrustedLenIterator for PrimitiveArray { } } -impl FromIteratorReversed for PrimitiveArray { +impl FromIteratorReversed for Vec { fn from_trusted_len_iter_rev>(iter: I) -> Self { - let size = iter.size_hint().1.unwrap(); - - let mut vals: Vec = Vec::with_capacity(size); unsafe { - // Set to end of buffer. - let mut ptr = vals.as_mut_ptr().add(size); - - iter.for_each(|item| { - ptr = ptr.sub(1); - std::ptr::write(ptr, item); - }); - vals.set_len(size) + let len = iter.size_hint().1.unwrap(); + let mut out: Vec = Vec::with_capacity(len); + let mut idx = len; + for x in iter { + debug_assert!(idx > 0); + idx -= 1; + out.as_mut_ptr().add(idx).write(x); + } + debug_assert!(idx == 0); + out.set_len(len); + out } + } +} + +impl FromIteratorReversed for PrimitiveArray { + fn from_trusted_len_iter_rev>(iter: I) -> Self { + let vals: Vec = iter.collect_reversed(); PrimitiveArray::new(ArrowDataType::from(T::PRIMITIVE), vals.into(), None) } } diff --git a/crates/polars-arrow/src/trusted_len.rs b/crates/polars-arrow/src/trusted_len.rs index 4bdce32e4990..3237ba83cbb2 100644 --- a/crates/polars-arrow/src/trusted_len.rs +++ b/crates/polars-arrow/src/trusted_len.rs @@ -71,7 +71,7 @@ unsafe impl TrustedLen for std::iter::StepBy {} unsafe impl TrustedLen for Scan where F: FnMut(&mut St, I::Item) -> Option, - I: TrustedLen + Iterator, + I: TrustedLen, { } diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index e52cef3c296e..f09852f94f5e 100644 --- a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -1,6 +1,8 @@ +use arrow::bitmap::MutableBitmap; use arrow::legacy::kernels::set::set_at_nulls; use arrow::legacy::trusted_len::FromIteratorReversed; use arrow::legacy::utils::FromTrustedLenIterator; +use bytemuck::Zeroable; use num_traits::{Bounded, NumCast, One, Zero}; use crate::prelude::*; @@ -280,6 +282,70 @@ macro_rules! impl_fill_backward_limit { }}; } +fn fill_forward_numeric<'a, T, I>(ca: &'a ChunkedArray) -> ChunkedArray +where + T: PolarsDataType, + &'a ChunkedArray: IntoIterator, + I: TrustedLen + Iterator>>, + T::ZeroablePhysical<'a>: LocalCopy, +{ + // Compute values. + let values: Vec> = ca + .into_iter() + .scan(T::ZeroablePhysical::zeroed(), |prev, v| { + *prev = v.map(|v| v.into()).unwrap_or(prev.cheap_clone()); + Some(prev.cheap_clone()) + }) + .collect_trusted(); + + // Compute bitmask. + let num_start_nulls = ca.first_non_null().unwrap_or(ca.len()); + let mut bm = MutableBitmap::with_capacity(ca.len()); + bm.extend_constant(num_start_nulls, false); + bm.extend_constant(ca.len() - num_start_nulls, true); + ChunkedArray::from_chunk_iter_like( + ca, + [ + T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(true)) + .with_validity_typed(Some(bm.into())), + ], + ) +} + +fn fill_backward_numeric<'a, T, I>(ca: &'a ChunkedArray) -> ChunkedArray +where + T: PolarsDataType, + &'a ChunkedArray: IntoIterator, + I: TrustedLen + Iterator>> + DoubleEndedIterator, + T::ZeroablePhysical<'a>: LocalCopy, +{ + // Compute values. + let values: Vec> = ca + .into_iter() + .rev() + .scan(T::ZeroablePhysical::zeroed(), |prev, v| { + *prev = v.map(|v| v.into()).unwrap_or(prev.cheap_clone()); + Some(prev.cheap_clone()) + }) + .collect_reversed(); + + // Compute bitmask. + let num_end_nulls = ca + .last_non_null() + .map(|i| ca.len() - 1 - i) + .unwrap_or(ca.len()); + let mut bm = MutableBitmap::with_capacity(ca.len()); + bm.extend_constant(ca.len() - num_end_nulls, true); + bm.extend_constant(num_end_nulls, false); + ChunkedArray::from_chunk_iter_like( + ca, + [ + T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(true)) + .with_validity_typed(Some(bm.into())), + ], + ) +} + fn fill_null_numeric( ca: &ChunkedArray, strategy: FillNullStrategy, @@ -293,9 +359,9 @@ where return Ok(ca.clone()); } let mut out = match strategy { - FillNullStrategy::Forward(None) => fill_forward(ca), + FillNullStrategy::Forward(None) => fill_forward_numeric(ca), FillNullStrategy::Forward(Some(limit)) => fill_forward_limit(ca, limit), - FillNullStrategy::Backward(None) => fill_backward(ca), + FillNullStrategy::Backward(None) => fill_backward_numeric(ca), FillNullStrategy::Backward(Some(limit)) => fill_backward_limit(ca, limit), FillNullStrategy::Min => { ca.fill_null_with_values(ChunkAgg::min(ca).ok_or_else(err_fill_null)?)? diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 02377153c863..ce36e51a45e2 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1866,10 +1866,24 @@ def test_fill_nan() -> None: assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime] +def test_forward_fill() -> None: + df = pl.DataFrame({"a": [1.0, None, 3.0]}) + fill = df.select(pl.col("a").forward_fill())["a"] + assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64)) + + df = pl.DataFrame({"a": [None, 1, None]}) + fill = df.select(pl.col("a").forward_fill())["a"] + assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64)) + + def test_backward_fill() -> None: df = pl.DataFrame({"a": [1.0, None, 3.0]}) - col_a_backward_fill = df.select([pl.col("a").backward_fill()])["a"] - assert_series_equal(col_a_backward_fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64)) + fill = df.select(pl.col("a").backward_fill())["a"] + assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64)) + + df = pl.DataFrame({"a": [None, 1, None]}) + fill = df.select(pl.col("a").backward_fill())["a"] + assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64)) def test_shrink_to_fit() -> None: