Skip to content

Commit

Permalink
feat(python): Support List types in Series.to_numpy (#16315)
Browse files Browse the repository at this point in the history
Co-authored-by: ritchie <[email protected]>
  • Loading branch information
stinodego and ritchie46 authored May 20, 2024
1 parent 129c951 commit 1b58f78
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 39 deletions.
48 changes: 48 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,25 @@ impl ListChunked {
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
}

/// Returns an iterator over the offsets of this chunked array.
///
/// The offsets are returned as though the array consisted of a single chunk.
/// Returns an iterator over the offsets of this chunked array.
///
/// The offsets are returned as though the array consisted of a single chunk.
pub fn iter_offsets(&self) -> impl Iterator<Item = i64> + '_ {
    let mut per_chunk_offsets = self.downcast_iter().map(|arr| arr.offsets().iter());
    let first_chunk = per_chunk_offsets.next().unwrap();

    // A sliced array need not start at offset 0, so shift every emitted
    // offset by the first one to make the sequence start at 0.
    let start = *first_chunk.clone().next().unwrap();

    OffsetsIterator {
        current_offsets_iter: first_chunk,
        current_adjusted_offset: 0,
        offset_adjustment: -start,
        offsets_iters: per_chunk_offsets,
    }
}

/// Ignore the list indices and apply `func` to the inner type as [`Series`].
pub fn apply_to_inner(
&self,
Expand Down Expand Up @@ -93,3 +112,32 @@ impl ListChunked {
})
}
}

/// Iterator yielding list offsets as though the chunked array were a single
/// contiguous chunk: each chunk's offsets are shifted so they continue where
/// the previous chunk ended.
pub struct OffsetsIterator<'a, N>
where
    N: Iterator<Item = std::slice::Iter<'a, i64>>,
{
    /// Offset iterators for the chunks not yet visited.
    offsets_iters: N,
    /// Offset iterator of the chunk currently being consumed.
    current_offsets_iter: std::slice::Iter<'a, i64>,
    /// Last offset that was emitted (after adjustment).
    current_adjusted_offset: i64,
    /// Shift applied to the current chunk's raw offsets.
    offset_adjustment: i64,
}

impl<'a, N> Iterator for OffsetsIterator<'a, N>
where
    N: Iterator<Item = std::slice::Iter<'a, i64>>,
{
    type Item = i64;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current_offsets_iter.next() {
                Some(&raw) => {
                    self.current_adjusted_offset = raw + self.offset_adjustment;
                    return Some(self.current_adjusted_offset);
                },
                None => {
                    // Current chunk exhausted; move on to the next one (or stop).
                    self.current_offsets_iter = self.offsets_iters.next()?;
                    // The first offset of the new chunk corresponds to the
                    // position we already emitted, so consume it and recompute
                    // the shift that keeps the sequence contiguous.
                    let &first = self.current_offsets_iter.next().unwrap();
                    self.offset_adjustment = self.current_adjusted_offset - first;
                },
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
use arrow::compute::take::take_unchecked;
use arrow::offset::OffsetsBuffer;
use polars_utils::vec::PushUnchecked;

use super::*;
Expand All @@ -15,9 +16,10 @@ impl ChunkExplode for ListChunked {
}

fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
// A list array's memory layout is actually already 'exploded', so we can just take the values array
// of the list. And we also return a slice of the offsets. This slice can be used to find the old
// list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation
// A list array's memory layout is actually already 'exploded', so we can just take the
// values array of the list. And we also return a slice of the offsets. This slice can be
// used to find the old list layout or indexes to expand a DataFrame in the same manner as
// the `explode` operation.
let ca = self.rechunk();
let listarr: &LargeListArray = ca.downcast_iter().next().unwrap();
let offsets_buf = listarr.offsets().clone();
Expand Down
12 changes: 6 additions & 6 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4405,8 +4405,8 @@ def is_between(
def to_numpy(
self,
*,
allow_copy: bool = True,
writable: bool = False,
allow_copy: bool = True,
use_pyarrow: bool = True,
zero_copy_only: bool | None = None,
) -> np.ndarray[Any, Any]:
Expand All @@ -4423,13 +4423,13 @@ def to_numpy(
Parameters
----------
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
writable
Ensure the resulting array is writable. This will force a copy of the data
if the array was created without copy, as the underlying Arrow data is
if the array was created without copy as the underlying Arrow data is
immutable.
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
use_pyarrow
First convert to PyArrow, then call `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
Expand Down Expand Up @@ -4474,7 +4474,7 @@ def to_numpy(
zero_copy_only=not allow_copy, writable=writable
)

return self._s.to_numpy(allow_copy=allow_copy, writable=writable)
return self._s.to_numpy(writable=writable, allow_copy=allow_copy)

@unstable()
def to_jax(self, device: jax.Device | str | None = None) -> jax.Array:
Expand Down
78 changes: 51 additions & 27 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use polars_core::prelude::*;
use pyo3::exceptions::PyValueError;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PyList;
use pyo3::types::{PyList, PySlice};

use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
use crate::error::PyPolarsErr;
Expand Down Expand Up @@ -167,39 +167,41 @@ impl PySeries {
/// This method copies data only when necessary. Set `allow_copy` to raise an error if copy
/// is required. Set `writable` to make sure the resulting array is writable, possibly requiring
/// copying the data.
fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult<PyObject> {
if self.series.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy data for empty Series.
return series_to_numpy_with_copy(py, &self.series);
}
fn to_numpy(&self, py: Python, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
series_to_numpy(py, &self.series, writable, allow_copy)
}
}

if let Some((mut arr, writable_flag)) =
try_series_to_numpy_view(py, &self.series, false, allow_copy)
{
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
/// Convert a Series to a NumPy ndarray.
fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
if s.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy data for empty Series.
return series_to_numpy_with_copy(py, s, true);
}
if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) {
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
return Ok(arr);
}

if !allow_copy {
return Err(PyValueError::new_err("cannot return a zero-copy array"));
arr = arr.call_method0(py, intern!(py, "copy"))?;
}
return Ok(arr);
}

series_to_numpy_with_copy(py, &self.series)
if !allow_copy {
return Err(PyValueError::new_err("cannot return a zero-copy array"));
}

series_to_numpy_with_copy(py, s, writable)
}

/// Convert a Series to a NumPy ndarray, copying data in the process.
///
/// This method will cast integers to floats so that `null = np.nan`.
fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyResult<PyObject> {
use DataType::*;
let out = match s.dtype() {
Int8 => numeric_series_to_numpy::<Int8Type, f32>(py, s),
Expand Down Expand Up @@ -267,7 +269,8 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py));
PyArray1::from_iter_bound(py, values).into_py(py)
},
Array(_, _) => array_series_to_numpy(py, s),
List(_) => list_series_to_numpy(py, s, writable),
Array(_, _) => array_series_to_numpy(py, s, writable),
#[cfg(feature = "object")]
Object(_, _) => {
let ca = s
Expand Down Expand Up @@ -357,14 +360,35 @@ where
PyArray1::<T>::from_iter_bound(py, values).into_py(py)
}
/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
fn array_series_to_numpy(py: Python, s: &Series) -> PyObject {
fn array_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner, writable).unwrap();

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
unreachable!()
};
reshape_numpy_array(py, np_array_flat, ca.len(), *width)
}
/// Convert lists by flattening first, converting the flat Series, and then splitting by offsets.
///
/// Returns a 1-D NumPy object array whose elements are the per-row subarrays.
fn list_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
    let ca = s.list().unwrap();
    let s_inner = ca.get_inner();

    // NOTE(review): `allow_copy` is hard-coded to `true` for the inner
    // conversion — TODO confirm callers never require zero-copy enforcement here.
    let np_array_flat = series_to_numpy(py, &s_inner, writable, true).unwrap();

    // Split the NumPy array into subarrays by offset.
    // TODO: Downcast the NumPy array to Rust and split without calling into Python.
    let mut offsets = ca.iter_offsets().map(|o| isize::try_from(o).unwrap());
    let mut prev_offset = offsets.next().unwrap();
    let values = offsets.map(|current_offset| {
        // Each [prev_offset, current_offset) slice is one list element.
        let slice = PySlice::new_bound(py, prev_offset, current_offset, 1);
        prev_offset = current_offset;
        np_array_flat
            .call_method1(py, "__getitem__", (slice,))
            .unwrap()
    });

    PyArray1::from_iter_bound(py, values).into_py(py)
}
19 changes: 16 additions & 3 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,6 @@ def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:
(pl.Binary, [b"a", b"bc", b"def"]),
(pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]),
(pl.Object, [Path(), Path("abc")]),
# TODO: Implement for List types
# (pl.List, [[1], [2, 3]]),
# (pl.List, [["a"], ["b", "c"], []]),
],
)
@pytest.mark.parametrize("with_nulls", [False, True])
Expand Down Expand Up @@ -275,6 +272,22 @@ def test_series_to_numpy_array_of_arrays() -> None:
assert_allow_copy_false_raises(s)


@pytest.mark.parametrize("chunked", [True, False])
def test_series_to_numpy_list(chunked: bool) -> None:
    """A List Series converts to an object ndarray of per-row subarrays."""
    values = [[1, 2], [3, 4, 5], [6], []]
    s = pl.Series(values)
    if chunked:
        s = pl.concat([s[:2], s[2:]])

    result = s.to_numpy(use_pyarrow=False)

    expected = np.array([np.array(v, dtype=np.int64) for v in values], dtype=np.object_)
    assert result.dtype == expected.dtype
    for res, exp in zip(result, expected):
        assert_array_equal(res, exp)
        # Chunked input forces a copy of the flat data, making rows writable.
        assert res.flags.writeable == chunked
    assert_allow_copy_false_raises(s)


def test_to_numpy_null() -> None:
s = pl.Series([None, None], dtype=pl.Null)
result = s.to_numpy(use_pyarrow=False)
Expand Down

0 comments on commit 1b58f78

Please sign in to comment.