diff --git a/crates/polars-compute/src/unique/boolean.rs b/crates/polars-compute/src/unique/boolean.rs index 654c2c492c678..4e32c82dad7c2 100644 --- a/crates/polars-compute/src/unique/boolean.rs +++ b/crates/polars-compute/src/unique/boolean.rs @@ -1,86 +1,98 @@ use arrow::array::{Array, BooleanArray}; use arrow::bitmap::MutableBitmap; -use super::UniqueKernel; - -fn bool_unique_fold<'a>( - fst: &'a BooleanArray, - arrs: impl Iterator, -) -> BooleanArray { - // can be None, Some(true), Some(false) - // - // We assign values to each value - // None = 1 - // Some(false) = 2 - // Some(true) = 3 - // - // And keep track of 2 things - // - `found_set`: which values have already appeared - // - `order`: in which order did the values appear +use super::RangedUniqueKernel; - #[inline(always)] - fn append_arr(arr: &BooleanArray, found_set: &mut u32, order: &mut u32) { - for v in arr { - let value = v.map_or(1, |v| 2 + u32::from(v)); - let nulled_value = if *found_set & (1 << value) != 0 { - 0 - } else { - value - }; - - *order |= nulled_value << (found_set.count_ones() * 2); - *found_set |= 1 << value; - - if *found_set == 0b1110 { - break; - } - } +const fn to_value(scalar: Option) -> u8 { + match scalar { + None => 1, + Some(false) => 2, + Some(true) => 3, } +} - let mut found_set = 0u32; - let mut order = 0u32; +impl RangedUniqueKernel for BooleanArray { + type Scalar<'a> = bool; + type Range<'a> = (); - append_arr(fst, &mut found_set, &mut order); - for arr in arrs { - append_arr(arr, &mut found_set, &mut order); + #[inline(always)] + fn to_value<'a>(scalar: Option>, _: Self::Range<'a>) -> u8 { + to_value(scalar) } - let mut values = MutableBitmap::with_capacity(3); - let validity = if found_set & 0b10 != 0 { - let mut validity = MutableBitmap::with_capacity(3); - while order != 0 { - values.push(order & 0b11 > 2); - validity.push(order & 0b11 > 1); - order >>= 2; - } - Some(validity.freeze()) - } else { - while order != 0 { - values.push(order & 0b11 > 2); - order >>= 2; + fn unique_fold<'a>(fst: &'a Self, others: impl Iterator, _: Self::Range<'a>) -> Self { + const ALL_FOUND: u32 = 0b1110; + + // We keep track of 2 things + // - `found_set`: which values have already appeared + // - `order`: in which order did the values appear + + #[inline(always)] + fn append_arr(arr: &BooleanArray, found_set: &mut u32, order: &mut u32) { + if arr.len() == 0 { + return; + } + + let null_count = arr.null_count(); + + if arr.len() == null_count { + *found_set |= 1 << to_value(None); // None + *order = (*order << 2) | u32::from(to_value(None)); + return; + } + + for v in arr { + let value = u32::from(to_value(v)); + let nulled_value = if *found_set & (1 << value) != 0 { + 0 + } else { + value + }; + + *order |= nulled_value << (found_set.count_ones() * 2); + *found_set |= 1 << value; + + if *found_set == ALL_FOUND { + break; + } + } } - None - }; - let values = values.freeze(); + let mut found_set = 0u32; + let mut order = 0u32; - BooleanArray::new(fst.data_type().clone(), values, validity) -} + append_arr(fst, &mut found_set, &mut order); + for arr in others { + append_arr(arr, &mut found_set, &mut order); + } -impl UniqueKernel for BooleanArray { - fn unique_fold<'a>(fst: &'a Self, others: impl Iterator) -> Self { - bool_unique_fold(fst, others) - } + let mut values = MutableBitmap::with_capacity(3); + let validity = if found_set & 0b10 != 0 { + let mut validity = MutableBitmap::with_capacity(3); + while order != 0 { + values.push(order & 0b11 > 2); + validity.push(order & 0b11 > 1); + order >>= 2; + } + Some(validity.freeze()) + } else { + while order != 0 { + values.push(order & 0b11 > 2); + order >>= 2; + } + None + }; + + let values = values.freeze(); - fn unique(&self) -> Self { - Self::unique_fold(self, [].iter()) + BooleanArray::new(fst.data_type().clone(), values, validity) } - fn unique_sorted(&self) -> Self { - Self::unique_fold(self, [].iter()) + fn unique<'a>(&'a self, range: Self::Range<'a>) -> Self { + Self::unique_fold(self, [].iter(), range) } - fn n_unique(&self) -> usize { + fn n_unique<'a>(&'a self, _: Self::Range<'a>) -> usize { if self.len() == 0 { return 0; } @@ -106,8 +118,8 @@ impl UniqueKernel for BooleanArray { } #[inline] - fn n_unique_non_null(&self) -> usize { - self.n_unique() - usize::from(self.null_count() > 0) + fn n_unique_non_null<'a>(&'a self, range: Self::Range<'a>) -> usize { + self.n_unique(range) - usize::from(self.null_count() > 0) } } @@ -122,7 +134,7 @@ fn test_boolean_distinct_count() { >>::map($validity, |v| Bitmap::from_iter(v)); let arr = BooleanArray::new(ArrowDataType::Boolean, Bitmap::from_iter($values), validity); - assert_eq!(arr.n_unique(), $dc); + assert_eq!(arr.n_unique(()), $dc); }; } diff --git a/crates/polars-compute/src/unique/mod.rs b/crates/polars-compute/src/unique/mod.rs index 30e6770943d4b..22b8d2d5cb1c4 100644 --- a/crates/polars-compute/src/unique/mod.rs +++ b/crates/polars-compute/src/unique/mod.rs @@ -1,7 +1,7 @@ use arrow::array::Array; -/// Kernel to calculate the number of unique elements -pub trait UniqueKernel: Array { +/// Kernel to calculate the number of unique elements where the elements are already sorted. +pub trait SortedUniqueKernel: Array { /// Calculate the set of unique elements in `fst` and `others` and fold the result into one /// array. fn unique_fold<'a>(fst: &'a Self, others: impl Iterator) -> Self; @@ -10,8 +10,24 @@ pub trait UniqueKernel: Array { /// `self`. fn unique(&self) -> Self; - /// Calculate the set of unique elements in [`Self`] where `self` is sorted. - fn unique_sorted(&self) -> Self; + /// Calculate the number of unique elements in [`Self`] + /// + /// A null is also considered a unique value + fn n_unique(&self) -> usize; + + /// Calculate the number of unique non-null elements in [`Self`] + fn n_unique_non_null(&self) -> usize; +} + +/// Kernel to calculate the number of unique elements +pub trait HashSetUniqueKernel: Array { + /// Calculate the set of unique elements in `fst` and `others` and fold the result into one + /// array. + fn unique_fold<'a>(fst: &'a Self, others: impl Iterator) -> Self; + + /// Calculate the set of unique elements in [`Self`] where we have no further information about + /// `self`. + fn unique(&self) -> Self; /// Calculate the number of unique elements in [`Self`] /// @@ -22,4 +38,34 @@ pub trait UniqueKernel: Array { fn n_unique_non_null(&self) -> usize; } +/// Kernel to calculate the number of unique elements where elements are in a small range of +/// values. +pub trait RangedUniqueKernel: Array { + type Scalar<'a>; + type Range<'a>; + + fn to_value<'a>(scalar: Option>, range: Self::Range<'a>) -> u8; + + /// Calculate the set of unique elements in `fst` and `others` and fold the result into one + /// array. + fn unique_fold<'a>( + fst: &'a Self, + others: impl Iterator, + range: Self::Range<'a>, + ) -> Self; + + /// Calculate the set of unique elements in [`Self`] where we have no further information about + /// `self`. + fn unique<'a>(&'a self, range: Self::Range<'a>) -> Self; + + /// Calculate the number of unique elements in [`Self`] + /// + /// A null is also considered a unique value + fn n_unique<'a>(&'a self, range: Self::Range<'a>) -> usize; + + /// Calculate the number of unique non-null elements in [`Self`] + fn n_unique_non_null<'a>(&'a self, range: Self::Range<'a>) -> usize; +} + mod boolean; +mod primitive; diff --git a/crates/polars-compute/src/unique/primitive.rs b/crates/polars-compute/src/unique/primitive.rs new file mode 100644 index 0000000000000..1b7477dee79d0 --- /dev/null +++ b/crates/polars-compute/src/unique/primitive.rs @@ -0,0 +1,134 @@ +use std::ops::{Add, Sub, RangeInclusive}; + +use arrow::array::PrimitiveArray; +use arrow::bitmap::MutableBitmap; +use arrow::types::NativeType; +use num_traits::FromPrimitive; +// use polars_utils::total_ord::TotalOrd; + +use super::RangedUniqueKernel; + +const SEEN_ALL: u128 = !0; +const SEEN_NONE_MASK: u128 = 1; + +fn append_arr(arr: &PrimitiveArray, seen: &mut u128, range: (RangeInclusive, bool)) +where + T: Add + Sub + FromPrimitive +{ + for v in arr { + *seen |= 1 << >::to_value(v.copied(), range.clone()); + + if *seen == SEEN_ALL { + break; + } + } +} + +impl RangedUniqueKernel for PrimitiveArray +where + T: Add + Sub + FromPrimitive +{ + type Scalar<'a> = T; + type Range<'a> = (RangeInclusive, bool); + + #[inline(always)] + fn to_value<'a>(scalar: Option>, range: Self::Range<'a>) -> u8 { + // debug_assert!({ + // (*range.0.end() - *range.0.start()).to_le_bytes()[0] < 128 + u8::from(range.1) + // }); + // debug_assert!({ + // let mut is_zero = true; + // for b in (*range.0.end() - *range.0.start()).to_le_bytes().as_ref().iter().skip(1) { + // is_zero &= *b == 0; + // } + // is_zero + // }); + // + match scalar { + None => { + debug_assert!(!range.1); + 0 + }, + Some(v) => { + // debug_assert!(::tot_le(&v, range.0.end())); + // debug_assert!(::tot_ge(&v, range.0.start())); + + (v - *range.0.start()).to_le_bytes()[0] + u8::from(range.1) + } + } + } + + fn unique_fold<'a>( + fst: &'a Self, + others: impl Iterator, + range: Self::Range<'a>, + ) -> Self { + let mut seen = 0u128; + + append_arr(fst, &mut seen, range.clone()); + for arr in others { + if seen == SEEN_ALL { + break; + } + + append_arr(arr, &mut seen, range.clone()); + } + + let num_values = seen.count_ones() as usize; + let mut values = Vec::with_capacity(num_values); + + let (values, validity) = if range.1 && seen & SEEN_NONE_MASK != 0 { + let mut validity = MutableBitmap::with_capacity(num_values); + + values.push(T::zeroed()); + validity.push(false); + seen >>= 1; + + let mut offset = 0u8; + while seen != 0 { + let shift = seen.trailing_zeros(); + offset += shift as u8; + values.push(*range.0.start() + T::from_u8(offset).unwrap()); + validity.push(true); + + seen >>= shift + 1; + offset += 1; + } + + (values, Some(validity.freeze())) + } else { + seen >>= u8::from(range.1); + + let mut offset = 0u8; + while seen != 0 { + let shift = seen.trailing_zeros(); + offset += shift as u8; + values.push(*range.0.start() + T::from_u8(offset).unwrap()); + + seen >>= shift + 1; + offset += 1; + } + + (values, None) + }; + + PrimitiveArray::new(fst.data_type().clone(), values.into(), validity) + } + + fn unique<'a>(&'a self, range: Self::Range<'a>) -> Self { + Self::unique_fold(self, [].iter(), range) + } + + fn n_unique<'a>(&'a self, range: Self::Range<'a>) -> usize { + let mut seen = 0u128; + append_arr(self, &mut seen, range.clone()); + seen.count_ones() as usize + } + + fn n_unique_non_null<'a>(&'a self, range: Self::Range<'a>) -> usize { + let mut seen = 0u128; + append_arr(self, &mut seen, range.clone()); + seen &= !SEEN_NONE_MASK; + seen.count_ones() as usize + } +} diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index af351e02ca736..90f3cda53fda2 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -1,6 +1,7 @@ use std::hash::Hash; use arrow::bitmap::MutableBitmap; +use num_traits::FromPrimitive; use polars_utils::total_ord::{ToTotalOrd, TotalHash}; use crate::hashing::_HASHMAP_INIT_SIZE; @@ -120,6 +121,23 @@ where } }, IsSorted::Not => { + let md = self.metadata(); + match (md.get_min_value(), md.get_max_value()) { + (Some(min), Some(max)) => { + if *max - *min <= ::from_u8(127).unwrap() { + let mut iter = self.downcast_iter(); + let fst = iter.next().unwrap(); + let chunk = as polars_compute::unique::RangedUniqueKernel>::unique_fold( + fst, + iter, + (*min..=*max, self.null_count() > 0), + ); + return Ok(Self::with_chunk(self.name(), chunk)); + } + }, + _ => {}, + }; + let sorted = self.sort(false); sorted.unique() }, @@ -244,7 +262,7 @@ impl ChunkUnique for BooleanChunked { )); }; - let unique = polars_compute::unique::UniqueKernel::unique_fold(arr, iter); + let unique = polars_compute::unique::RangedUniqueKernel::unique_fold(arr, iter, ()); Ok(Self::with_chunk(self.name(), unique)) }