From 11fe9d8f223d50b5e3284b924bb969049b0be6c6 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Thu, 16 May 2024 15:05:14 +0200 Subject: [PATCH] fix(python): Fix `Series.to_numpy` for Array types with nulls and nested Arrays (#16230) --- .../src/chunked_array/array/mod.rs | 16 +- .../polars-core/src/chunked_array/list/mod.rs | 15 +- py-polars/polars/series/series.py | 11 +- py-polars/src/series/export.rs | 36 +++-- py-polars/src/to_numpy.rs | 147 ++++++++++++------ .../interop/numpy/test_to_numpy_series.py | 35 ++++- 6 files changed, 160 insertions(+), 100 deletions(-) diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index 15fe892d3404..b92255a8f995 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -31,18 +31,10 @@ impl ArrayChunked { /// Get the inner values as `Series` pub fn get_inner(&self) -> Series { - let ca = self.rechunk(); - let field = self.inner_dtype().to_arrow_field("item", true); - let arr = ca.downcast_iter().next().unwrap(); - unsafe { - Series::_try_from_arrow_unchecked_with_md( - self.name(), - vec![(arr.values()).clone()], - &field.data_type, - Some(&field.metadata), - ) - .unwrap() - } + let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect(); + + // SAFETY: Data type of arrays matches because they are chunks from the same array. + unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) } } /// Ignore the list indices and apply `func` to the inner type as [`Series`]. diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs index eba65e8980ab..730b10d4a162 100644 --- a/crates/polars-core/src/chunked_array/list/mod.rs +++ b/crates/polars-core/src/chunked_array/list/mod.rs @@ -41,17 +41,10 @@ impl ListChunked { /// Get the inner values as [`Series`], ignoring the list offsets. pub fn get_inner(&self) -> Series { - let ca = self.rechunk(); - let arr = ca.downcast_iter().next().unwrap(); - // SAFETY: - // Inner dtype is passed correctly - unsafe { - Series::from_chunks_and_dtype_unchecked( - self.name(), - vec![arr.values().clone()], - &ca.inner_dtype(), - ) - } + let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect(); + + // SAFETY: Data type of arrays matches because they are chunks from the same array. + unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) } } /// Ignore the list indices and apply `func` to the inner type as [`Series`]. diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 0ba9239148d5..9d1a939dc3ac 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4460,7 +4460,7 @@ def to_numpy( if ( use_pyarrow and _PYARROW_AVAILABLE - and self.dtype not in (Object, Datetime, Duration, Date, Array) + and self.dtype not in (Date, Datetime, Duration, Array, Object) ): if not allow_copy and self.n_chunks() > 1 and not self.is_empty(): msg = "cannot return a zero-copy array" @@ -4470,15 +4470,6 @@ def to_numpy( zero_copy_only=not allow_copy, writable=writable ) - if self.dtype == Array: - np_array = self.explode().to_numpy( - allow_copy=allow_copy, - writable=writable, - use_pyarrow=use_pyarrow, - ) - np_array.shape = (self.len(), self.dtype.width) # type: ignore[attr-defined] - return np_array - return self._s.to_numpy(allow_copy=allow_copy, writable=writable) def to_torch(self) -> torch.Tensor: diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 8af513cc718b..4903b3df093e 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -9,6 +9,7 @@ use pyo3::types::PyList; use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter}; use crate::error::PyPolarsErr; use crate::prelude::*; +use crate::to_numpy::{reshape_numpy_array, series_to_numpy_view}; use crate::{arrow_interop, raise_err, PySeries}; #[pymethods] @@ -169,20 +170,20 @@ impl PySeries { fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult { if self.series.is_empty() { // Take this path to ensure a writable array. - // This does not actually copy for empty Series. + // This does not actually copy data for empty Series. return series_to_numpy_with_copy(py, &self.series); - } else if self.series.null_count() == 0 { - if let Some(mut arr) = self.to_numpy_view(py) { - if writable { - if !allow_copy { - return Err(PyValueError::new_err( - "cannot return a zero-copy writable array", - )); - } - arr = arr.call_method0(py, intern!(py, "copy"))?; + } + + if let Some(mut arr) = series_to_numpy_view(py, &self.series, false) { + if writable { + if !allow_copy { + return Err(PyValueError::new_err( + "cannot return a zero-copy writable array", + )); } - return Ok(arr); + arr = arr.call_method0(py, intern!(py, "copy"))?; } + return Ok(arr); } if !allow_copy { @@ -264,6 +265,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult { let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py)); PyArray1::from_iter_bound(py, values).into_py(py) }, + Array(_, _) => array_series_to_numpy(py, s), #[cfg(feature = "object")] Object(_, _) => { let ca = s @@ -352,3 +354,15 @@ where let values = ca.iter().map(|v| v.unwrap_or(i64::MIN).into()); PyArray1::::from_iter_bound(py, values).into_py(py) } +/// Convert arrays by flattening first, converting the flat Series, and then reshaping. +fn array_series_to_numpy(py: Python, s: &Series) -> PyObject { + let ca = s.array().unwrap(); + let s_inner = ca.get_inner(); + let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap(); + + // Reshape to the original shape. + let DataType::Array(_, width) = s.dtype() else { + unreachable!() + }; + reshape_numpy_array(py, np_array_flat, ca.len(), *width) +} diff --git a/py-polars/src/to_numpy.rs b/py-polars/src/to_numpy.rs index f17741376216..6c086b0f24c5 100644 --- a/py-polars/src/to_numpy.rs +++ b/py-polars/src/to_numpy.rs @@ -8,7 +8,9 @@ use numpy::{ use polars_core::prelude::*; use polars_core::utils::try_get_supertype; use polars_core::with_match_physical_numeric_polars_type; +use pyo3::intern; use pyo3::prelude::*; +use pyo3::types::PyTuple; use crate::conversion::Wrap; use crate::dataframe::PyDataFrame; @@ -58,59 +60,66 @@ impl PySeries { /// appropriately. #[allow(clippy::wrong_self_convention)] pub fn to_numpy_view(&self, py: Python) -> Option { - // NumPy arrays are always contiguous - if self.series.n_chunks() > 1 { - return None; - } + series_to_numpy_view(py, &self.series, true) + } +} - match self.series.dtype() { - dt if dt.is_numeric() => { - let dims = [self.series.len()].into_dimension(); - let owner = self.clone().into_py(py); // Keep the Series memory alive. - with_match_physical_numeric_polars_type!(dt, |$T| { - let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py); - let ca: &ChunkedArray<$T> = self.series.unpack::<$T>().unwrap(); - let slice = ca.data_views().next().unwrap(); - - let view = unsafe { - create_borrowed_np_array::<_>( - py, - np_dtype, - dims, - flags::NPY_ARRAY_FARRAY_RO, - slice.as_ptr() as _, - owner, - ) - }; - Some(view) - }) - }, - dt @ (DataType::Datetime(_, _) | DataType::Duration(_)) => { - let np_dtype = polars_dtype_to_np_temporal_dtype(py, dt); - - let phys = self.series.to_physical_repr(); - let ca = phys.i64().unwrap(); - let slice = ca.data_views().next().unwrap(); - let dims = [self.series.len()].into_dimension(); - let owner = self.clone().into_py(py); - - let view = unsafe { - create_borrowed_np_array::<_>( - py, - np_dtype, - dims, - flags::NPY_ARRAY_FARRAY_RO, - slice.as_ptr() as _, - owner, - ) - }; - Some(view) - }, - _ => None, - } +pub(crate) fn series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option { + // NumPy arrays are always contiguous + if s.n_chunks() > 1 { + return None; + } + if !allow_nulls && s.null_count() > 0 { + return None; } + let view = match s.dtype() { + dt if dt.is_numeric() => numeric_series_to_numpy_view(py, s), + DataType::Datetime(_, _) | DataType::Duration(_) => temporal_series_to_numpy_view(py, s), + DataType::Array(_, _) => array_series_to_numpy_view(py, s, allow_nulls)?, + _ => return None, + }; + Some(view) +} +fn numeric_series_to_numpy_view(py: Python, s: &Series) -> PyObject { + let dims = [s.len()].into_dimension(); + let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive. + with_match_physical_numeric_polars_type!(s.dtype(), |$T| { + let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py); + let ca: &ChunkedArray<$T> = s.unpack::<$T>().unwrap(); + let slice = ca.data_views().next().unwrap(); + + unsafe { + create_borrowed_np_array::<_>( + py, + np_dtype, + dims, + flags::NPY_ARRAY_FARRAY_RO, + slice.as_ptr() as _, + owner, + ) + } + }) } +fn temporal_series_to_numpy_view(py: Python, s: &Series) -> PyObject { + let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype()); + + let phys = s.to_physical_repr(); + let ca = phys.i64().unwrap(); + let slice = ca.data_views().next().unwrap(); + let dims = [s.len()].into_dimension(); + let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive. + unsafe { + create_borrowed_np_array::<_>( + py, + np_dtype, + dims, + flags::NPY_ARRAY_FARRAY_RO, + slice.as_ptr() as _, + owner, + ) + } +} /// Get the NumPy temporal data type associated with the given Polars [`DataType`]. fn polars_dtype_to_np_temporal_dtype<'a>( py: Python<'a>, @@ -139,6 +148,46 @@ fn polars_dtype_to_np_temporal_dtype<'a>( _ => panic!("only Datetime/Duration inputs supported, got {}", dtype), } } +fn array_series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option { + let ca = s.array().unwrap(); + let s_inner = ca.get_inner(); + let np_array_flat = series_to_numpy_view(py, &s_inner, allow_nulls)?; + + // Reshape to the original shape. + let DataType::Array(_, width) = s.dtype() else { + unreachable!() + }; + let view = reshape_numpy_array(py, np_array_flat, ca.len(), *width); + Some(view) +} +/// Reshape the first dimension of a NumPy array to the given height and width. +pub(crate) fn reshape_numpy_array( + py: Python, + arr: PyObject, + height: usize, + width: usize, +) -> PyObject { + let shape = arr + .getattr(py, intern!(py, "shape")) + .unwrap() + .extract::>(py) + .unwrap(); + + if shape.len() == 1 { + // In this case we can avoid allocating a Vec. + let new_shape = (height, width); + arr.call_method1(py, intern!(py, "reshape"), new_shape) + .unwrap() + } else { + let mut new_shape_vec = vec![height, width]; + for v in &shape[1..] { + new_shape_vec.push(*v) + } + let new_shape = PyTuple::new_bound(py, new_shape_vec); + arr.call_method1(py, intern!(py, "reshape"), new_shape) + .unwrap() + } +} #[pymethods] #[allow(clippy::wrong_self_convention)] diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index b559918ae7d0..3ecd0cf4963a 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -223,13 +223,14 @@ def test_series_to_numpy_bool_with_nulls() -> None: def test_series_to_numpy_array_of_int() -> None: - values = [[1, 2], [3, 4], [5, 6]] - s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) - result = s.to_numpy(use_pyarrow=False) + values = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] + s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int8, 3), 2)) + result = s.to_numpy(use_pyarrow=False, allow_copy=False) expected = np.array(values) assert_array_equal(result, expected) - assert result.dtype == np.int64 + assert result.dtype == np.int8 + assert result.shape == (2, 2, 3) def test_series_to_numpy_array_of_str() -> None: @@ -240,9 +241,6 @@ def test_series_to_numpy_array_of_str() -> None: assert result.dtype == np.object_ -@pytest.mark.skip( - reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268" -) def test_series_to_numpy_array_with_nulls() -> None: values = [[1, 2], [3, 4], None] s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) @@ -254,6 +252,29 @@ def test_series_to_numpy_array_with_nulls() -> None: assert_allow_copy_false_raises(s) +def test_series_to_numpy_array_with_nested_nulls() -> None: + values = [[None, 2], [3, 4], [5, None]] + s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array([[np.nan, 2.0], [3.0, 4.0], [5.0, np.nan]]) + assert_array_equal(result, expected) + assert result.dtype == np.float64 + assert_allow_copy_false_raises(s) + + +def test_series_to_numpy_array_of_arrays() -> None: + values = [[[None, 2], [3, 4]], [None, [7, 8]]] + s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int64, 2), 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array([[[np.nan, 2], [3, 4]], [[np.nan, np.nan], [7, 8]]]) + assert_array_equal(result, expected) + assert result.dtype == np.float64 + assert result.shape == (2, 2, 2) + assert_allow_copy_false_raises(s) + + def test_to_numpy_null() -> None: s = pl.Series([None, None], dtype=pl.Null) result = s.to_numpy(use_pyarrow=False)