Skip to content

Commit

Permalink
feat(rust): Add split_at method to arrow Array (#16620)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Jun 1, 2024
1 parent 5974ac7 commit ae70fd4
Show file tree
Hide file tree
Showing 31 changed files with 902 additions and 53 deletions.
29 changes: 28 additions & 1 deletion crates/polars-arrow/src/array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use either::Either;

use super::specification::try_check_offsets_bounds;
use super::{Array, GenericBinaryArray};
use super::{Array, GenericBinaryArray, Splitable};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
use crate::bitmap::Bitmap;
Expand Down Expand Up @@ -450,3 +450,30 @@ unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
self.offsets().buffer()
}
}

impl<O: Offset> Splitable for BinaryArray<O> {
    /// Returns `true` when `offset` does not exceed the array length, so a split
    /// exactly at the end is accepted.
    #[inline(always)]
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into a left and a right half around `offset`.
    ///
    /// Only `offsets` and `validity` are partitioned; each half receives its own
    /// clone of `data_type` and `values`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `split_at_unchecked` on `offsets` and
    /// `validity`. Presumably the caller must guarantee `self.check_bound(offset)`;
    /// confirm against the `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            offsets: head_offsets,
            values: self.values.clone(),
            validity: head_validity,
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            offsets: tail_offsets,
            values: self.values.clone(),
            validity: tail_validity,
        };

        (head, tail)
    }
}
48 changes: 48 additions & 0 deletions crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
pub use view::{View, INLINE_VIEW_SIZE};

use super::Splitable;

pub type MutablePlString = MutableBinaryViewArray<str>;
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;

Expand Down Expand Up @@ -476,6 +478,16 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
self.validity.as_ref()
}

fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
let (lhs, rhs) = Splitable::split_at(self, offset);
(Box::new(lhs), Box::new(rhs))
}

unsafe fn split_at_boxed_unchecked(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
let (lhs, rhs) = unsafe { Splitable::split_at_unchecked(self, offset) };
(Box::new(lhs), Box::new(rhs))
}

fn slice(&mut self, offset: usize, length: usize) {
assert!(
offset + length <= self.len(),
Expand Down Expand Up @@ -505,3 +517,39 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
Box::new(self.clone())
}
}

impl<T: ViewType + ?Sized> Splitable for BinaryViewArrayGeneric<T> {
    /// Returns `true` when `offset` does not exceed the array length, so a split
    /// exactly at the end is accepted.
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into a left and a right half around `offset`.
    ///
    /// `views` and `validity` are partitioned; both halves receive a clone of
    /// `data_type` and `buffers` and the same `total_buffer_len()`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `split_at_unchecked` on `views` and
    /// `validity`, and the halves are built with `new_unchecked`. Presumably the
    /// caller must guarantee `self.check_bound(offset)`; confirm against the
    /// `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (head_views, tail_views) = unsafe { self.views.split_at_unchecked(offset) };
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        // The fifth constructor argument is `0` for a half known to be empty and
        // `UNKNOWN_LEN` otherwise (presumably the cached total byte length, left
        // to be recomputed on demand; verify against `new_unchecked`).
        let head = unsafe {
            Self::new_unchecked(
                self.data_type.clone(),
                head_views,
                self.buffers.clone(),
                head_validity,
                match offset {
                    0 => 0,
                    _ => UNKNOWN_LEN as _,
                },
                self.total_buffer_len(),
            )
        };
        let tail = unsafe {
            Self::new_unchecked(
                self.data_type.clone(),
                tail_views,
                self.buffers.clone(),
                tail_validity,
                if offset != self.len() {
                    UNKNOWN_LEN as _
                } else {
                    0
                },
                self.total_buffer_len(),
            )
        };

        (head, tail)
    }
}
26 changes: 25 additions & 1 deletion crates/polars-arrow/src/array/boolean/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use either::Either;

use super::Array;
use super::{Array, Splitable};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
use crate::bitmap::{Bitmap, MutableBitmap};
Expand Down Expand Up @@ -390,6 +390,30 @@ impl Array for BooleanArray {
}
}

impl Splitable for BooleanArray {
    /// Returns `true` when `offset` does not exceed the array length, so a split
    /// exactly at the end is accepted.
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the array into a left and a right half around `offset`.
    ///
    /// `values` and `validity` are both partitioned at `offset`; each half gets
    /// its own clone of `data_type`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `split_at_unchecked` on `values` and
    /// `validity`. Presumably the caller must guarantee `self.check_bound(offset)`;
    /// confirm against the `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };
        let (head_values, tail_values) = unsafe { self.values.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            values: head_values,
            validity: head_validity,
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            values: tail_values,
            validity: tail_validity,
        };

        (head, tail)
    }
}

impl From<Bitmap> for BooleanArray {
fn from(values: Bitmap) -> Self {
Self {
Expand Down
25 changes: 24 additions & 1 deletion crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use polars_error::{polars_bail, PolarsResult};

use super::primitive::PrimitiveArray;
use super::specification::check_indexes;
use super::{new_empty_array, new_null_array, Array};
use super::{new_empty_array, new_null_array, Array, Splitable};
use crate::array::dictionary::typed_iterator::{
DictValue, DictionaryIterTyped, DictionaryValuesIterTyped,
};
Expand Down Expand Up @@ -398,3 +398,26 @@ impl<K: DictionaryKey> Array for DictionaryArray<K> {
Box::new(self.clone().with_validity(validity))
}
}

impl<K: DictionaryKey> Splitable for DictionaryArray<K> {
    /// Returns `true` when `offset` is a valid split point, i.e. `offset <= self.len()`.
    ///
    /// The bound is inclusive so that splitting exactly at the end (producing an
    /// empty right half) and splitting an empty array at `0` are both accepted,
    /// in line with the other array types' `Splitable` implementations.
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the dictionary array into a left and a right half around `offset`.
    ///
    /// Only `keys` is partitioned; each half receives its own clone of
    /// `data_type` and `values`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `Splitable::split_at_unchecked` on
    /// `keys`. Presumably the caller must guarantee `self.check_bound(offset)`;
    /// confirm against the `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (lhs_keys, rhs_keys) = unsafe { Splitable::split_at_unchecked(&self.keys, offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                keys: lhs_keys,
                values: self.values.clone(),
            },
            Self {
                data_type: self.data_type.clone(),
                keys: rhs_keys,
                values: self.values.clone(),
            },
        )
    }
}
30 changes: 29 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::Array;
use super::{Array, Splitable};
use crate::bitmap::Bitmap;
use crate::buffer::Buffer;
use crate::datatypes::ArrowDataType;
Expand Down Expand Up @@ -235,6 +235,34 @@ impl Array for FixedSizeBinaryArray {
}
}

impl Splitable for FixedSizeBinaryArray {
    /// Returns `true` when `offset` is a valid split point, i.e. `offset <= self.len()`.
    ///
    /// The bound is inclusive so that splitting exactly at the end (producing an
    /// empty right half) and splitting an empty array at `0` are both accepted,
    /// in line with the other array types' `Splitable` implementations.
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the array into a left and a right half around element index `offset`.
    ///
    /// `values` is cut at `offset * size` and `validity` at `offset`; each half
    /// keeps the same `size` and gets its own clone of `data_type`.
    ///
    /// # Safety
    ///
    /// The split positions are forwarded unchecked to `split_at_unchecked` on
    /// `values` and `validity`. Presumably the caller must guarantee
    /// `self.check_bound(offset)`; confirm against the `Splitable` trait
    /// documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        let size = self.size;

        // `offset` counts elements. This assumes `values` is the flat buffer that
        // stores `size` entries per element (the Arrow fixed-size binary layout),
        // so the element index has to be scaled to a position in that buffer.
        // For an in-bounds `offset` the product cannot exceed `values.len()`.
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (lhs_values, rhs_values) = unsafe { self.values.split_at_unchecked(offset * size) };
        // Validity is split at the element index itself.
        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                values: lhs_values,
                validity: lhs_validity,
                size,
            },
            Self {
                data_type: self.data_type.clone(),
                values: rhs_values,
                validity: rhs_validity,
                size,
            },
        )
    }
}

impl FixedSizeBinaryArray {
/// Creates a [`FixedSizeBinaryArray`] from a fallible iterator of optional `[u8]`.
pub fn try_from_iter<P: AsRef<[u8]>, I: IntoIterator<Item = Option<P>>>(
Expand Down
32 changes: 31 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{new_empty_array, new_null_array, Array};
use super::{new_empty_array, new_null_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};

Expand Down Expand Up @@ -215,3 +215,33 @@ impl Array for FixedSizeListArray {
Box::new(self.clone().with_validity(validity))
}
}

impl Splitable for FixedSizeListArray {
    /// Returns `true` when `offset` is a valid split point, i.e. `offset <= self.len()`.
    fn check_bound(&self, offset: usize) -> bool {
        offset <= self.len()
    }

    /// Splits the array into a left and a right half around list index `offset`.
    ///
    /// The child `values` array is cut at `offset * size`, while `validity` is
    /// cut at `offset`. Each half keeps the same `size` and gets its own clone
    /// of `data_type`.
    ///
    /// # Safety
    ///
    /// The split positions are forwarded unchecked to `split_at_boxed_unchecked`
    /// on `values` and `split_at_unchecked` on `validity`. Presumably the caller
    /// must guarantee `self.check_bound(offset)`; confirm against the `Splitable`
    /// trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        let size = self.size;

        // The child array holds `size` entries per list slot, so the list index
        // is scaled to a child index here.
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (lhs_values, rhs_values) =
            unsafe { self.values.split_at_boxed_unchecked(offset * size) };
        // Validity tracks one bit per list slot (Arrow fixed-size list layout),
        // not per child entry, so it must be split at `offset` itself. Scaling it
        // by `size` would cut at the wrong slot and could pass an out-of-bounds
        // position to the unchecked split.
        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };

        (
            Self {
                data_type: self.data_type.clone(),
                values: lhs_values,
                validity: lhs_validity,
                size,
            },
            Self {
                data_type: self.data_type.clone(),
                values: rhs_values,
                validity: rhs_validity,
                size,
            },
        )
    }
}
28 changes: 27 additions & 1 deletion crates/polars-arrow/src/array/list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::specification::try_check_offsets_bounds;
use super::{new_empty_array, Array};
use super::{new_empty_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};
use crate::offset::{Offset, Offsets, OffsetsBuffer};
Expand Down Expand Up @@ -237,3 +237,29 @@ impl<O: Offset> Array for ListArray<O> {
Box::new(self.clone().with_validity(validity))
}
}

impl<O: Offset> Splitable for ListArray<O> {
    /// Returns `true` when `offset` does not exceed the array length, so a split
    /// exactly at the end is accepted.
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the list array into a left and a right half around `offset`.
    ///
    /// Only `offsets` and `validity` are partitioned; each half receives its own
    /// clone of `data_type` and of the child `values`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `split_at_unchecked` on `offsets` and
    /// `validity`. Presumably the caller must guarantee `self.check_bound(offset)`;
    /// confirm against the `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            offsets: head_offsets,
            validity: head_validity,
            values: self.values.clone(),
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            offsets: tail_offsets,
            validity: tail_validity,
            values: self.values.clone(),
        };

        (head, tail)
    }
}
28 changes: 27 additions & 1 deletion crates/polars-arrow/src/array/map/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::specification::try_check_offsets_bounds;
use super::{new_empty_array, Array};
use super::{new_empty_array, Array, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};
use crate::offset::OffsetsBuffer;
Expand Down Expand Up @@ -195,3 +195,29 @@ impl Array for MapArray {
Box::new(self.clone().with_validity(validity))
}
}

impl Splitable for MapArray {
    /// Returns `true` when `offset` does not exceed the array length, so a split
    /// exactly at the end is accepted.
    fn check_bound(&self, offset: usize) -> bool {
        self.len() >= offset
    }

    /// Splits the map array into a left and a right half around `offset`.
    ///
    /// Only `offsets` and `validity` are partitioned; each half receives its own
    /// clone of `data_type` and `field`.
    ///
    /// # Safety
    ///
    /// `offset` is forwarded unchecked to `split_at_unchecked` on `offsets` and
    /// `validity`. Presumably the caller must guarantee `self.check_bound(offset)`;
    /// confirm against the `Splitable` trait documentation.
    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
        // SAFETY: the bound on `offset` is the caller's obligation (see `# Safety`).
        let (head_validity, tail_validity) = unsafe { self.validity.split_at_unchecked(offset) };
        let (head_offsets, tail_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };

        let head = Self {
            data_type: self.data_type.clone(),
            offsets: head_offsets,
            field: self.field.clone(),
            validity: head_validity,
        };
        let tail = Self {
            data_type: self.data_type.clone(),
            offsets: tail_offsets,
            field: self.field.clone(),
            validity: tail_validity,
        };

        (head, tail)
    }
}
Loading

0 comments on commit ae70fd4

Please sign in to comment.