From c3c5e70939257c8805c6c72ec09fcefc26607143 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 27 May 2024 00:33:57 +0200 Subject: [PATCH] Remove Rust implementation --- .../chunked_array/ops/explode_and_offsets.rs | 135 ------------------ .../src/dsl/function_expr/strings.rs | 9 -- crates/polars-plan/src/dsl/string.rs | 5 - py-polars/polars/expr/string.py | 3 +- py-polars/src/expr/string.rs | 4 - py-polars/src/lazyframe/visitor/expr_nodes.rs | 4 - 6 files changed, 2 insertions(+), 158 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 0f59c80d4651..7c08d4de622a 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,8 +1,5 @@ -use arrow::bitmap::MutableBitmap; -use arrow::compute::cast::utf8view_to_utf8; use arrow::compute::take::take_unchecked; use arrow::offset::OffsetsBuffer; -use polars_utils::vec::PushUnchecked; use super::*; @@ -233,135 +230,3 @@ impl ChunkExplode for ArrayChunked { )) } } - -impl ChunkExplode for StringChunked { - fn offsets(&self) -> PolarsResult> { - let mut offsets = Vec::with_capacity(self.len() + 1); - let mut length_so_far = 0; - offsets.push(length_so_far); - - for arr in self.downcast_iter() { - for len in arr.len_iter() { - // SAFETY: - // pre-allocated - unsafe { offsets.push_unchecked(length_so_far) }; - length_so_far += len as i64; - } - } - - // SAFETY: - // Monotonically increasing. - unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) } - } - - fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { - // A list array's memory layout is actually already 'exploded', so we can just take the values array - // of the list. And we also return a slice of the offsets. This slice can be used to find the old - // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation - let ca = self.rechunk(); - let array = ca.downcast_iter().next().unwrap(); - // TODO! maybe optimize for new utf8view? - let array = utf8view_to_utf8(array); - - let values = array.values(); - let old_offsets = array.offsets().clone(); - - let (new_offsets, validity) = if let Some(validity) = array.validity() { - // capacity estimate - let capacity = self.get_values_size() + validity.unset_bits(); - - let old_offsets = old_offsets.as_slice(); - let mut old_offset = old_offsets[0]; - let mut new_offsets = Vec::with_capacity(capacity + 1); - new_offsets.push(old_offset); - - let mut bitmap = MutableBitmap::with_capacity(capacity); - let values = values.as_slice(); - for (&offset, valid) in old_offsets[1..].iter().zip(validity) { - // SAFETY: - // new_offsets already has a single value, so -1 is always in bounds - let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) }; - - if valid { - debug_assert!(old_offset as usize <= values.len()); - debug_assert!(offset as usize <= values.len()); - let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) }; - - // take the string value and find the char offsets - // create a new offset value for each char boundary - // SAFETY: - // we know we have string data. - let str_val = unsafe { std::str::from_utf8_unchecked(val) }; - - let char_offsets = str_val - .char_indices() - .skip(1) - .map(|t| t.0 as i64 + latest_offset); - - // extend the chars - // also keep track of the amount of offsets added - // as we must update the validity bitmap - let len_before = new_offsets.len(); - new_offsets.extend(char_offsets); - new_offsets.push(latest_offset + str_val.len() as i64); - bitmap.extend_constant(new_offsets.len() - len_before, true); - } else { - // no data, just add old offset and set null bit - new_offsets.push(latest_offset); - bitmap.push(false) - } - old_offset = offset; - } - - (new_offsets.into(), bitmap.into()) - } else { - // fast(er) explode - - // we cannot naively explode, because there might be empty strings. - - // capacity estimate - let capacity = self.get_values_size(); - let old_offsets = old_offsets.as_slice(); - let mut old_offset = old_offsets[0]; - let mut new_offsets = Vec::with_capacity(capacity + 1); - new_offsets.push(old_offset); - - let values = values.as_slice(); - for &offset in &old_offsets[1..] { - // SAFETY: - // new_offsets already has a single value, so -1 is always in bounds - let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) }; - debug_assert!(old_offset as usize <= values.len()); - debug_assert!(offset as usize <= values.len()); - let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) }; - - // take the string value and find the char offsets - // create a new offset value for each char boundary - // SAFETY: - // we know we have string data. - let str_val = unsafe { std::str::from_utf8_unchecked(val) }; - - let char_offsets = str_val - .char_indices() - .skip(1) - .map(|t| t.0 as i64 + latest_offset); - - // extend the chars - new_offsets.extend(char_offsets); - new_offsets.push(latest_offset + str_val.len() as i64); - old_offset = offset; - } - - (new_offsets.into(), None) - }; - - let array = unsafe { - Utf8Array::::from_data_unchecked_default(new_offsets, values.clone(), validity) - }; - - let new_arr = Box::new(array) as ArrayRef; - - let s = Series::try_from((self.name(), new_arr)).unwrap(); - Ok((s, old_offsets)) - } -} diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 4c932f8a131f..2f2f80e2e6d5 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -39,7 +39,6 @@ pub enum StringFunction { }, CountMatches(bool), EndsWith, - Explode, Extract(usize), ExtractAll, #[cfg(feature = "extract_groups")] @@ -137,7 +136,6 @@ impl StringFunction { Contains { .. } => mapper.with_dtype(DataType::Boolean), CountMatches(_) => mapper.with_dtype(DataType::UInt32), EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean), - Explode => mapper.with_same_dtype(), Extract(_) => mapper.with_same_dtype(), ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))), #[cfg(feature = "extract_groups")] @@ -208,7 +206,6 @@ impl Display for StringFunction { ConcatHorizontal { .. } => "concat_horizontal", #[cfg(feature = "concat_str")] ConcatVertical { .. } => "concat_vertical", - Explode => "explode", ExtractAll => "extract_all", #[cfg(feature = "extract_groups")] ExtractGroups { .. } => "extract_groups", @@ -365,7 +362,6 @@ impl From for SpecialEq> { Base64Encode => map!(strings::base64_encode), #[cfg(feature = "binary_encoding")] Base64Decode(strict) => map!(strings::base64_decode, strict), - Explode => map!(strings::explode), #[cfg(feature = "dtype-decimal")] ToDecimal(infer_len) => map!(strings::to_decimal, infer_len), #[cfg(feature = "extract_jsonpath")] @@ -972,11 +968,6 @@ pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult { s.str()?.base64_decode(strict).map(|ca| ca.into_series()) } -pub(super) fn explode(s: &Series) -> PolarsResult { - let ca = s.str()?; - ca.explode() -} - #[cfg(feature = "dtype-decimal")] pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult { let ca = s.str()?; diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index e5aa3fc58119..29f278a52a5d 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -547,11 +547,6 @@ impl StringNameSpace { ) } - pub fn explode(self) -> Expr { - self.0 - .apply_private(FunctionExpr::StringExpr(StringFunction::Explode)) - } - #[cfg(feature = "extract_jsonpath")] pub fn json_decode(self, dtype: Option, infer_schema_len: Option) -> Expr { self.0 diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 7a083e2ebe0e..088071bd8987 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -2393,7 +2393,8 @@ def explode(self) -> Expr: │ r │ └─────┘ """ - return wrap_expr(self._pyexpr.str_explode()) + split = self.split("") + return F.when(split.ne_missing([])).then(split).otherwise([""]).explode() def to_integer( self, *, base: int | IntoExprColumn = 10, strict: bool = True diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index eb623c0dce8e..4903413d604c 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -110,10 +110,6 @@ impl PyExpr { self.inner.clone().str().tail(n.inner).into() } - fn str_explode(&self) -> Self { - self.inner.clone().str().explode().into() - } - fn str_to_uppercase(&self) -> Self { self.inner.clone().str().to_uppercase().into() } diff --git a/py-polars/src/lazyframe/visitor/expr_nodes.rs b/py-polars/src/lazyframe/visitor/expr_nodes.rs index 2b2d9e087c55..4b911720efb3 100644 --- a/py-polars/src/lazyframe/visitor/expr_nodes.rs +++ b/py-polars/src/lazyframe/visitor/expr_nodes.rs @@ -116,7 +116,6 @@ pub enum PyStringFunction { Contains, CountMatches, EndsWith, - Explode, Extract, ExtractAll, ExtractGroups, @@ -675,9 +674,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { StringFunction::EndsWith => { (PyStringFunction::EndsWith.into_py(py),).to_object(py) }, - StringFunction::Explode => { - (PyStringFunction::Explode.into_py(py),).to_object(py) - }, StringFunction::Extract(_) => { (PyStringFunction::Extract.into_py(py),).to_object(py) },