From 1b58f786d6458e6e7d8e0eb481d5b890e6f24a09 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 20 May 2024 09:33:51 +0200 Subject: [PATCH] feat(python): Support List types in `Series.to_numpy` (#16315) Co-authored-by: ritchie --- .../polars-core/src/chunked_array/list/mod.rs | 48 ++++++++++++ .../chunked_array/ops/explode_and_offsets.rs | 8 +- py-polars/polars/series/series.py | 12 +-- py-polars/src/series/export.rs | 78 ++++++++++++------- .../interop/numpy/test_to_numpy_series.py | 19 ++++- 5 files changed, 126 insertions(+), 39 deletions(-) diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs index 730b10d4a162..5e92a10ac59a 100644 --- a/crates/polars-core/src/chunked_array/list/mod.rs +++ b/crates/polars-core/src/chunked_array/list/mod.rs @@ -47,6 +47,25 @@ impl ListChunked { unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) } } + /// Returns an iterator over the offsets of this chunked array. + /// + /// The offsets are returned as though the array consisted of a single chunk. + pub fn iter_offsets(&self) -> impl Iterator + '_ { + let mut offsets = self.downcast_iter().map(|arr| arr.offsets().iter()); + let first_iter = offsets.next().unwrap(); + + // The first offset doesn't have to be 0, it can be sliced to `n` in the array. + // So we must correct for this. + let correction = first_iter.clone().next().unwrap(); + + OffsetsIterator { + current_offsets_iter: first_iter, + current_adjusted_offset: 0, + offset_adjustment: -correction, + offsets_iters: offsets, + } + } + /// Ignore the list indices and apply `func` to the inner type as [`Series`]. pub fn apply_to_inner( &self, @@ -93,3 +112,32 @@ impl ListChunked { }) } } + +pub struct OffsetsIterator<'a, N> +where + N: Iterator>, +{ + offsets_iters: N, + current_offsets_iter: std::slice::Iter<'a, i64>, + current_adjusted_offset: i64, + offset_adjustment: i64, +} + +impl<'a, N> Iterator for OffsetsIterator<'a, N> +where + N: Iterator>, +{ + type Item = i64; + + fn next(&mut self) -> Option { + if let Some(offset) = self.current_offsets_iter.next() { + self.current_adjusted_offset = offset + self.offset_adjustment; + Some(self.current_adjusted_offset) + } else { + self.current_offsets_iter = self.offsets_iters.next()?; + let first = self.current_offsets_iter.next().unwrap(); + self.offset_adjustment = self.current_adjusted_offset - first; + self.next() + } + } +} diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 5d3a277651dd..0f59c80d4651 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,6 +1,7 @@ use arrow::bitmap::MutableBitmap; use arrow::compute::cast::utf8view_to_utf8; use arrow::compute::take::take_unchecked; +use arrow::offset::OffsetsBuffer; use polars_utils::vec::PushUnchecked; use super::*; @@ -15,9 +16,10 @@ impl ChunkExplode for ListChunked { } fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { - // A list array's memory layout is actually already 'exploded', so we can just take the values array - // of the list. And we also return a slice of the offsets. This slice can be used to find the old - // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation + // A list array's memory layout is actually already 'exploded', so we can just take the + // values array of the list. And we also return a slice of the offsets. This slice can be + // used to find the old list layout or indexes to expand a DataFrame in the same manner as + // the `explode` operation. let ca = self.rechunk(); let listarr: &LargeListArray = ca.downcast_iter().next().unwrap(); let offsets_buf = listarr.offsets().clone(); diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 66f150957584..257d25f7d34b 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4405,8 +4405,8 @@ def is_between( def to_numpy( self, *, - allow_copy: bool = True, writable: bool = False, + allow_copy: bool = True, use_pyarrow: bool = True, zero_copy_only: bool | None = None, ) -> np.ndarray[Any, Any]: @@ -4423,13 +4423,13 @@ def to_numpy( Parameters ---------- - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. writable Ensure the resulting array is writable. This will force a copy of the data - if the array was created without copy, as the underlying Arrow data is + if the array was created without copy as the underlying Arrow data is immutable. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. use_pyarrow First convert to PyArrow, then call `pyarrow.Array.to_numpy `_ @@ -4474,7 +4474,7 @@ def to_numpy( zero_copy_only=not allow_copy, writable=writable ) - return self._s.to_numpy(allow_copy=allow_copy, writable=writable) + return self._s.to_numpy(writable=writable, allow_copy=allow_copy) @unstable() def to_jax(self, device: jax.Device | str | None = None) -> jax.Array: diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 0b69e8adff3e..f0fefa7a2007 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -4,7 +4,7 @@ use polars_core::prelude::*; use pyo3::exceptions::PyValueError; use pyo3::intern; use pyo3::prelude::*; -use pyo3::types::PyList; +use pyo3::types::{PyList, PySlice}; use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter}; use crate::error::PyPolarsErr; @@ -167,39 +167,41 @@ impl PySeries { /// This method copies data only when necessary. Set `allow_copy` to raise an error if copy /// is required. Set `writable` to make sure the resulting array is writable, possibly requiring /// copying the data. - fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult { - if self.series.is_empty() { - // Take this path to ensure a writable array. - // This does not actually copy data for empty Series. - return series_to_numpy_with_copy(py, &self.series); - } + fn to_numpy(&self, py: Python, writable: bool, allow_copy: bool) -> PyResult { + series_to_numpy(py, &self.series, writable, allow_copy) + } +} - if let Some((mut arr, writable_flag)) = - try_series_to_numpy_view(py, &self.series, false, allow_copy) - { - if writable && !writable_flag { - if !allow_copy { - return Err(PyValueError::new_err( - "cannot return a zero-copy writable array", - )); - } - arr = arr.call_method0(py, intern!(py, "copy"))?; +/// Convert a Series to a NumPy ndarray. +fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> PyResult { + if s.is_empty() { + // Take this path to ensure a writable array. + // This does not actually copy data for empty Series. + return series_to_numpy_with_copy(py, s, true); + } + if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) { + if writable && !writable_flag { + if !allow_copy { + return Err(PyValueError::new_err( + "cannot return a zero-copy writable array", + )); } - return Ok(arr); - } - - if !allow_copy { - return Err(PyValueError::new_err("cannot return a zero-copy array")); + arr = arr.call_method0(py, intern!(py, "copy"))?; } + return Ok(arr); + } - series_to_numpy_with_copy(py, &self.series) + if !allow_copy { + return Err(PyValueError::new_err("cannot return a zero-copy array")); } + + series_to_numpy_with_copy(py, s, writable) } /// Convert a Series to a NumPy ndarray, copying data in the process. /// /// This method will cast integers to floats so that `null = np.nan`. -fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult { +fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyResult { use DataType::*; let out = match s.dtype() { Int8 => numeric_series_to_numpy::(py, s), @@ -267,7 +269,8 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult { let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py)); PyArray1::from_iter_bound(py, values).into_py(py) }, - Array(_, _) => array_series_to_numpy(py, s), + List(_) => list_series_to_numpy(py, s, writable), + Array(_, _) => array_series_to_numpy(py, s, writable), #[cfg(feature = "object")] Object(_, _) => { let ca = s @@ -357,10 +360,10 @@ where PyArray1::::from_iter_bound(py, values).into_py(py) } /// Convert arrays by flattening first, converting the flat Series, and then reshaping. -fn array_series_to_numpy(py: Python, s: &Series) -> PyObject { +fn array_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject { let ca = s.array().unwrap(); let s_inner = ca.get_inner(); - let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap(); + let np_array_flat = series_to_numpy_with_copy(py, &s_inner, writable).unwrap(); // Reshape to the original shape. let DataType::Array(_, width) = s.dtype() else { @@ -368,3 +371,24 @@ fn array_series_to_numpy(py: Python, s: &Series) -> PyObject { }; reshape_numpy_array(py, np_array_flat, ca.len(), *width) } +/// Convert lists by flattening first, converting the flat Series, and then splitting by offsets. +fn list_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject { + let ca = s.list().unwrap(); + let s_inner = ca.get_inner(); + + let np_array_flat = series_to_numpy(py, &s_inner, writable, true).unwrap(); + + // Split the NumPy array into subarrays by offset. + // TODO: Downcast the NumPy array to Rust and split without calling into Python. + let mut offsets = ca.iter_offsets().map(|o| isize::try_from(o).unwrap()); + let mut prev_offset = offsets.next().unwrap(); + let values = offsets.map(|current_offset| { + let slice = PySlice::new_bound(py, prev_offset, current_offset, 1); + prev_offset = current_offset; + np_array_flat + .call_method1(py, "__getitem__", (slice,)) + .unwrap() + }); + + PyArray1::from_iter_bound(py, values).into_py(py) +} diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index f7d78b22d75d..501d127e6c7b 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -183,9 +183,6 @@ def test_series_to_numpy_datetime_with_tz_with_nulls() -> None: (pl.Binary, [b"a", b"bc", b"def"]), (pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]), (pl.Object, [Path(), Path("abc")]), - # TODO: Implement for List types - # (pl.List, [[1], [2, 3]]), - # (pl.List, [["a"], ["b", "c"], []]), ], ) @pytest.mark.parametrize("with_nulls", [False, True]) @@ -275,6 +272,22 @@ def test_series_to_numpy_array_of_arrays() -> None: assert_allow_copy_false_raises(s) +@pytest.mark.parametrize("chunked", [True, False]) +def test_series_to_numpy_list(chunked: bool) -> None: + values = [[1, 2], [3, 4, 5], [6], []] + s = pl.Series(values) + if chunked: + s = pl.concat([s[:2], s[2:]]) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array([np.array(v, dtype=np.int64) for v in values], dtype=np.object_) + for res, exp in zip(result, expected): + assert_array_equal(res, exp) + assert res.flags.writeable == chunked + assert result.dtype == expected.dtype + assert_allow_copy_false_raises(s) + + def test_to_numpy_null() -> None: s = pl.Series([None, None], dtype=pl.Null) result = s.to_numpy(use_pyarrow=False)