Skip to content

Commit

Permalink
Remove Rust implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed May 26, 2024
1 parent 52ba0f8 commit c3c5e70
Show file tree
Hide file tree
Showing 6 changed files with 2 additions and 158 deletions.
135 changes: 0 additions & 135 deletions crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
use arrow::compute::take::take_unchecked;
use arrow::offset::OffsetsBuffer;
use polars_utils::vec::PushUnchecked;

use super::*;

Expand Down Expand Up @@ -233,135 +230,3 @@ impl ChunkExplode for ArrayChunked {
))
}
}

/// Explode support for string columns: every string value is split into one
/// entry per `char`.
impl ChunkExplode for StringChunked {
    /// Builds the cumulative offsets of the values across all chunks.
    ///
    /// The buffer starts at `0` and gets one entry per value, each entry being
    /// the running sum of the lengths yielded by `len_iter()` up to and
    /// including that value (presumably byte lengths -- confirm against the
    /// array type), so entry `i + 1` minus entry `i` is the length of value `i`.
    fn offsets(&self) -> PolarsResult<OffsetsBuffer<i64>> {
        let mut offsets = Vec::with_capacity(self.len() + 1);
        let mut length_so_far = 0;
        offsets.push(length_so_far);

        for arr in self.downcast_iter() {
            for len in arr.len_iter() {
                // Advance the running total *before* recording it: the entry
                // pushed for a value must be its end offset. Recording first
                // would emit a duplicate leading 0 and never account for the
                // length of the final value.
                length_so_far += len as i64;
                // SAFETY:
                // pre-allocated: one slot for the leading 0 plus one per value.
                unsafe { offsets.push_unchecked(length_so_far) };
            }
        }

        // SAFETY:
        // Monotonically increasing.
        unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) }
    }

    /// Explodes every string into its characters.
    ///
    /// Returns the exploded values as a `Series` named like `self`, together
    /// with the offsets of the original (rechunked, `Utf8Array`-converted)
    /// strings. A null value yields a single null entry and an empty string
    /// yields a single empty-string entry, so no input row disappears.
    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
        // The exploded array reuses the original values buffer untouched; only a
        // new, finer-grained offsets buffer (one offset per char boundary) is
        // built. The original offsets are returned alongside so callers can
        // recover the old row layout.
        let ca = self.rechunk();
        let array = ca.downcast_iter().next().unwrap();
        // TODO! maybe optimize for new utf8view?
        let array = utf8view_to_utf8(array);

        let values = array.values();
        let old_offsets = array.offsets().clone();

        let (new_offsets, validity) = if let Some(validity) = array.validity() {
            // capacity estimate: roughly one entry per byte plus one per null.
            // Only a hint -- all pushes below are checked and may reallocate.
            let capacity = self.get_values_size() + validity.unset_bits();

            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let mut bitmap = MutableBitmap::with_capacity(capacity);
            let values = values.as_slice();
            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };

                if valid {
                    debug_assert!(old_offset as usize <= values.len());
                    debug_assert!(offset as usize <= values.len());
                    // SAFETY:
                    // both bounds come from the array's own offsets buffer and
                    // are therefore expected to lie within `values`
                    // (checked by the debug assertions above).
                    let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                    // take the string value and find the char offsets
                    // create a new offset value for each char boundary
                    // SAFETY:
                    // we know we have string data.
                    let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                    // `skip(1)`: the start of the first char is already the
                    // last offset in `new_offsets`.
                    let char_offsets = str_val
                        .char_indices()
                        .skip(1)
                        .map(|t| t.0 as i64 + latest_offset);

                    // extend the chars
                    // also keep track of the amount of offsets added
                    // as we must update the validity bitmap
                    let len_before = new_offsets.len();
                    new_offsets.extend(char_offsets);
                    // End offset of the last char; for an empty string this is
                    // the only offset added, producing one empty entry.
                    new_offsets.push(latest_offset + str_val.len() as i64);
                    bitmap.extend_constant(new_offsets.len() - len_before, true);
                } else {
                    // no data, just add old offset and set null bit
                    // NOTE(review): this assumes a null slot spans zero bytes in
                    // `values`; otherwise later offsets would drift -- confirm.
                    new_offsets.push(latest_offset);
                    bitmap.push(false)
                }
                old_offset = offset;
            }

            (new_offsets.into(), bitmap.into())
        } else {
            // fast(er) explode

            // we cannot naively explode, because there might be empty strings.

            // capacity estimate
            let capacity = self.get_values_size();
            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let values = values.as_slice();
            for &offset in &old_offsets[1..] {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
                debug_assert!(old_offset as usize <= values.len());
                debug_assert!(offset as usize <= values.len());
                // SAFETY:
                // both bounds come from the array's own offsets buffer and are
                // therefore expected to lie within `values`
                // (checked by the debug assertions above).
                let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                // take the string value and find the char offsets
                // create a new offset value for each char boundary
                // SAFETY:
                // we know we have string data.
                let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                // `skip(1)`: the start of the first char is already the last
                // offset in `new_offsets`.
                let char_offsets = str_val
                    .char_indices()
                    .skip(1)
                    .map(|t| t.0 as i64 + latest_offset);

                // extend the chars
                new_offsets.extend(char_offsets);
                // End offset of the last char; for an empty string this is the
                // only offset added, producing one empty entry.
                new_offsets.push(latest_offset + str_val.len() as i64);
                old_offset = offset;
            }

            (new_offsets.into(), None)
        };

        // SAFETY:
        // the new offsets are non-decreasing, were derived from char boundaries
        // of the same `values` buffer, and the validity bitmap (when present)
        // received exactly one bit per offset appended after the first.
        let array = unsafe {
            Utf8Array::<i64>::from_data_unchecked_default(new_offsets, values.clone(), validity)
        };

        let new_arr = Box::new(array) as ArrayRef;

        let s = Series::try_from((self.name(), new_arr)).unwrap();
        Ok((s, old_offsets))
    }
}
9 changes: 0 additions & 9 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ pub enum StringFunction {
},
CountMatches(bool),
EndsWith,
Explode,
Extract(usize),
ExtractAll,
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -137,7 +136,6 @@ impl StringFunction {
Contains { .. } => mapper.with_dtype(DataType::Boolean),
CountMatches(_) => mapper.with_dtype(DataType::UInt32),
EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),
Explode => mapper.with_same_dtype(),
Extract(_) => mapper.with_same_dtype(),
ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -208,7 +206,6 @@ impl Display for StringFunction {
ConcatHorizontal { .. } => "concat_horizontal",
#[cfg(feature = "concat_str")]
ConcatVertical { .. } => "concat_vertical",
Explode => "explode",
ExtractAll => "extract_all",
#[cfg(feature = "extract_groups")]
ExtractGroups { .. } => "extract_groups",
Expand Down Expand Up @@ -365,7 +362,6 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Base64Encode => map!(strings::base64_encode),
#[cfg(feature = "binary_encoding")]
Base64Decode(strict) => map!(strings::base64_decode, strict),
Explode => map!(strings::explode),
#[cfg(feature = "dtype-decimal")]
ToDecimal(infer_len) => map!(strings::to_decimal, infer_len),
#[cfg(feature = "extract_jsonpath")]
Expand Down Expand Up @@ -972,11 +968,6 @@ pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult<Series> {
s.str()?.base64_decode(strict).map(|ca| ca.into_series())
}

/// Views `s` as a string column (propagating the error from `str()` if that
/// fails) and returns the result of calling `explode()` on it.
pub(super) fn explode(s: &Series) -> PolarsResult<Series> {
    s.str()?.explode()
}

#[cfg(feature = "dtype-decimal")]
pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult<Series> {
let ca = s.str()?;
Expand Down
5 changes: 0 additions & 5 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,6 @@ impl StringNameSpace {
)
}

/// Wraps the underlying expression in the `StringFunction::Explode` string
/// function via `apply_private`.
pub fn explode(self) -> Expr {
    let function = FunctionExpr::StringExpr(StringFunction::Explode);
    self.0.apply_private(function)
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_decode(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
self.0
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2393,7 +2393,8 @@ def explode(self) -> Expr:
│ r │
└─────┘
"""
return wrap_expr(self._pyexpr.str_explode())
split = self.split("")
return F.when(split.ne_missing([])).then(split).otherwise([""]).explode()

def to_integer(
self, *, base: int | IntoExprColumn = 10, strict: bool = True
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ impl PyExpr {
self.inner.clone().str().tail(n.inner).into()
}

/// Clones the inner expression, applies `str().explode()` to it and converts
/// the result back into `Self`.
fn str_explode(&self) -> Self {
    let expr = self.inner.clone();
    expr.str().explode().into()
}

fn str_to_uppercase(&self) -> Self {
self.inner.clone().str().to_uppercase().into()
}
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ pub enum PyStringFunction {
Contains,
CountMatches,
EndsWith,
Explode,
Extract,
ExtractAll,
ExtractGroups,
Expand Down Expand Up @@ -675,9 +674,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::EndsWith => {
(PyStringFunction::EndsWith.into_py(py),).to_object(py)
},
StringFunction::Explode => {
(PyStringFunction::Explode.into_py(py),).to_object(py)
},
StringFunction::Extract(_) => {
(PyStringFunction::Extract.into_py(py),).to_object(py)
},
Expand Down

0 comments on commit c3c5e70

Please sign in to comment.