Skip to content

Commit

Permalink
depr(python,rust!): Deprecate str.explode in favor of `str.split(""…
Browse files Browse the repository at this point in the history
…).explode()` (#16508)
  • Loading branch information
stinodego authored May 28, 2024
1 parent 27c6dcd commit 7cfa80a
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 249 deletions.
135 changes: 0 additions & 135 deletions crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
use arrow::compute::take::take_unchecked;
use arrow::offset::OffsetsBuffer;
use polars_utils::vec::PushUnchecked;

use super::*;

Expand Down Expand Up @@ -233,135 +230,3 @@ impl ChunkExplode for ArrayChunked {
))
}
}

/// Character-wise explode for string columns: every string is split into one
/// row per `char`.
impl ChunkExplode for StringChunked {
    /// Returns the cumulative byte-length offsets of the strings in `self`.
    ///
    /// The buffer holds `self.len() + 1` entries: a leading `0`, followed by
    /// the end offset of every value, so entry `i + 1 - entry i` is the byte
    /// length of value `i`.
    fn offsets(&self) -> PolarsResult<OffsetsBuffer<i64>> {
        let mut offsets = Vec::with_capacity(self.len() + 1);
        let mut length_so_far = 0;
        offsets.push(length_so_far);

        for arr in self.downcast_iter() {
            for len in arr.len_iter() {
                // Advance the running total *before* pushing, so that the pushed
                // value is the end offset of the current string. Pushing first
                // would shift every offset by one slot and drop the final end.
                length_so_far += len as i64;
                // SAFETY:
                // pre-allocated
                unsafe { offsets.push_unchecked(length_so_far) };
            }
        }

        // SAFETY:
        // Monotonically increasing.
        unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) }
    }

    /// Explodes every string into its characters.
    ///
    /// Returns the exploded `Series` together with the offsets of the
    /// original (un-exploded) strings. A valid empty string yields a single
    /// empty-string row and a null yields a single null row.
    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
        // The values buffer of a string array already holds all characters back to back,
        // so it is reused untouched. Only the offsets are rebuilt, with one entry per char
        // boundary. The original offsets are returned alongside, so callers can recover the
        // old layout.
        let ca = self.rechunk();
        let array = ca.downcast_iter().next().unwrap();
        // TODO! maybe optimize for new utf8view?
        let array = utf8view_to_utf8(array);

        let values = array.values();
        let old_offsets = array.offsets().clone();

        let (new_offsets, validity) = if let Some(validity) = array.validity() {
            // capacity estimate
            let capacity = self.get_values_size() + validity.unset_bits();

            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let mut bitmap = MutableBitmap::with_capacity(capacity);
            let values = values.as_slice();
            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };

                if valid {
                    debug_assert!(old_offset as usize <= values.len());
                    debug_assert!(offset as usize <= values.len());
                    // SAFETY:
                    // both bounds come from the array's own offsets (see debug asserts above).
                    let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                    // take the string value and find the char offsets
                    // create a new offset value for each char boundary
                    // SAFETY:
                    // we know we have string data.
                    let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                    // The first boundary (index 0) equals `latest_offset`, which is
                    // already in `new_offsets`, hence the `skip(1)`.
                    let char_offsets = str_val
                        .char_indices()
                        .skip(1)
                        .map(|t| t.0 as i64 + latest_offset);

                    // extend the chars
                    // also keep track of the amount of offsets added
                    // as we must update the validity bitmap
                    let len_before = new_offsets.len();
                    new_offsets.extend(char_offsets);
                    // Closing offset of the last char; for an empty string this adds
                    // a single zero-length row.
                    new_offsets.push(latest_offset + str_val.len() as i64);
                    bitmap.extend_constant(new_offsets.len() - len_before, true);
                } else {
                    // no data, just add old offset and set null bit
                    new_offsets.push(latest_offset);
                    bitmap.push(false);
                }
                old_offset = offset;
            }

            (new_offsets.into(), bitmap.into())
        } else {
            // fast(er) explode

            // we cannot naively explode, because there might be empty strings.

            // capacity estimate
            let capacity = self.get_values_size();
            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let values = values.as_slice();
            for &offset in &old_offsets[1..] {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
                debug_assert!(old_offset as usize <= values.len());
                debug_assert!(offset as usize <= values.len());
                // SAFETY:
                // both bounds come from the array's own offsets (see debug asserts above).
                let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                // take the string value and find the char offsets
                // create a new offset value for each char boundary
                // SAFETY:
                // we know we have string data.
                let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                let char_offsets = str_val
                    .char_indices()
                    .skip(1)
                    .map(|t| t.0 as i64 + latest_offset);

                // extend the chars
                new_offsets.extend(char_offsets);
                new_offsets.push(latest_offset + str_val.len() as i64);
                old_offset = offset;
            }

            (new_offsets.into(), None)
        };

        // SAFETY:
        // the new offsets are non-decreasing, lie on char boundaries of the unchanged
        // values buffer, and the validity (if any) has one bit per new offset pair.
        let array = unsafe {
            Utf8Array::<i64>::from_data_unchecked_default(new_offsets, values.clone(), validity)
        };

        let new_arr = Box::new(array) as ArrayRef;

        let s = Series::try_from((self.name(), new_arr)).unwrap();
        Ok((s, old_offsets))
    }
}
9 changes: 0 additions & 9 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ pub enum StringFunction {
},
CountMatches(bool),
EndsWith,
Explode,
Extract(usize),
ExtractAll,
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -137,7 +136,6 @@ impl StringFunction {
Contains { .. } => mapper.with_dtype(DataType::Boolean),
CountMatches(_) => mapper.with_dtype(DataType::UInt32),
EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),
Explode => mapper.with_same_dtype(),
Extract(_) => mapper.with_same_dtype(),
ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -208,7 +206,6 @@ impl Display for StringFunction {
ConcatHorizontal { .. } => "concat_horizontal",
#[cfg(feature = "concat_str")]
ConcatVertical { .. } => "concat_vertical",
Explode => "explode",
ExtractAll => "extract_all",
#[cfg(feature = "extract_groups")]
ExtractGroups { .. } => "extract_groups",
Expand Down Expand Up @@ -365,7 +362,6 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Base64Encode => map!(strings::base64_encode),
#[cfg(feature = "binary_encoding")]
Base64Decode(strict) => map!(strings::base64_decode, strict),
Explode => map!(strings::explode),
#[cfg(feature = "dtype-decimal")]
ToDecimal(infer_len) => map!(strings::to_decimal, infer_len),
#[cfg(feature = "extract_jsonpath")]
Expand Down Expand Up @@ -972,11 +968,6 @@ pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult<Series> {
s.str()?.base64_decode(strict).map(|ca| ca.into_series())
}

/// Explodes a string `Series` into a separate row for every character.
///
/// Fails if `s` cannot be accessed as a string column.
pub(super) fn explode(s: &Series) -> PolarsResult<Series> {
    s.str()?.explode()
}

#[cfg(feature = "dtype-decimal")]
pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult<Series> {
let ca = s.str()?;
Expand Down
5 changes: 0 additions & 5 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,6 @@ impl StringNameSpace {
)
}

/// Builds the expression that applies `StringFunction::Explode` to the wrapped expression.
pub fn explode(self) -> Expr {
    let function = FunctionExpr::StringExpr(StringFunction::Explode);
    self.0.apply_private(function)
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_decode(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
self.0
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4952,7 +4952,6 @@ def explode(self) -> Self:
See Also
--------
Expr.list.explode : Explode a list column.
Expr.str.explode : Explode a string column.
Examples
--------
Expand Down
19 changes: 17 additions & 2 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import polars._reexport as pl
from polars import functions as F
from polars._utils.deprecation import (
deprecate_function,
deprecate_renamed_function,
deprecate_renamed_parameter,
issue_deprecation_warning,
Expand Down Expand Up @@ -2352,10 +2353,23 @@ def tail(self, n: int | IntoExprColumn) -> Expr:
n = parse_as_expression(n)
return wrap_expr(self._pyexpr.str_tail(n))

@deprecate_function(
'Use `.str.split("").explode()` instead.'
" Note that empty strings will result in null instead of being preserved."
" To get the exact same behavior, split first and then use when/then/otherwise"
" to handle the empty list before exploding.",
version="0.20.31",
)
def explode(self) -> Expr:
"""
Returns a column with a separate row for every string character.
.. deprecated:: 0.20.31
Use `.str.split("").explode()` instead.
Note that empty strings will result in null instead of being preserved.
To get the exact same behavior, split first and then use when/then/otherwise
to handle the empty list before exploding.
Returns
-------
Expr
Expand All @@ -2364,7 +2378,7 @@ def explode(self) -> Expr:
Examples
--------
>>> df = pl.DataFrame({"a": ["foo", "bar"]})
>>> df.select(pl.col("a").str.explode())
>>> df.select(pl.col("a").str.explode()) # doctest: +SKIP
shape: (6, 1)
┌─────┐
│ a │
Expand All @@ -2379,7 +2393,8 @@ def explode(self) -> Expr:
│ r │
└─────┘
"""
return wrap_expr(self._pyexpr.str_explode())
split = self.split("")
return F.when(split.ne_missing([])).then(split).otherwise([""]).explode()

def to_integer(
self, *, base: int | IntoExprColumn = 10, strict: bool = True
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4080,7 +4080,6 @@ def explode(self) -> Series:
See Also
--------
Series.list.explode : Explode a list column.
Series.str.explode : Explode a string column.
Examples
--------
Expand Down
16 changes: 15 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from polars._utils.deprecation import (
deprecate_function,
deprecate_renamed_function,
deprecate_renamed_parameter,
)
Expand Down Expand Up @@ -1776,10 +1777,23 @@ def tail(self, n: int | IntoExprColumn) -> Series:
]
"""

@deprecate_function(
'Use `.str.split("").explode()` instead.'
" Note that empty strings will result in null instead of being preserved."
" To get the exact same behavior, split first and then use when/then/otherwise"
" to handle the empty list before exploding.",
version="0.20.31",
)
def explode(self) -> Series:
"""
Returns a column with a separate row for every string character.
.. deprecated:: 0.20.31
Use `.str.split("").explode()` instead.
Note that empty strings will result in null instead of being preserved.
To get the exact same behavior, split first and then use when/then/otherwise
to handle the empty list before exploding.
Returns
-------
Series
Expand All @@ -1788,7 +1802,7 @@ def explode(self) -> Series:
Examples
--------
>>> s = pl.Series("a", ["foo", "bar"])
>>> s.str.explode()
>>> s.str.explode() # doctest: +SKIP
shape: (6,)
Series: 'a' [str]
[
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ impl PyExpr {
self.inner.clone().str().tail(n.inner).into()
}

/// Wraps `str().explode()` of a clone of the inner expression.
fn str_explode(&self) -> Self {
    let expr = self.inner.clone();
    expr.str().explode().into()
}

/// Wraps `str().to_uppercase()` of a clone of the inner expression.
fn str_to_uppercase(&self) -> Self {
    let expr = self.inner.clone();
    expr.str().to_uppercase().into()
}
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ pub enum PyStringFunction {
Contains,
CountMatches,
EndsWith,
Explode,
Extract,
ExtractAll,
ExtractGroups,
Expand Down Expand Up @@ -670,9 +669,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::EndsWith => {
(PyStringFunction::EndsWith.into_py(py),).to_object(py)
},
StringFunction::Explode => {
(PyStringFunction::Explode.into_py(py),).to_object(py)
},
StringFunction::Extract(_) => {
(PyStringFunction::Extract.into_py(py),).to_object(py)
},
Expand Down
Loading

0 comments on commit 7cfa80a

Please sign in to comment.