Skip to content

Commit

Permalink
fix(python): Fix Series.to_numpy for Array types with nulls and nes…
Browse files Browse the repository at this point in the history
…ted Arrays (#16230)
  • Loading branch information
stinodego authored May 16, 2024
1 parent 092e3da commit 11fe9d8
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 100 deletions.
16 changes: 4 additions & 12 deletions crates/polars-core/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,10 @@ impl ArrayChunked {

/// Get the inner values as `Series`
pub fn get_inner(&self) -> Series {
let ca = self.rechunk();
let field = self.inner_dtype().to_arrow_field("item", true);
let arr = ca.downcast_iter().next().unwrap();
unsafe {
Series::_try_from_arrow_unchecked_with_md(
self.name(),
vec![(arr.values()).clone()],
&field.data_type,
Some(&field.metadata),
)
.unwrap()
}
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();

// SAFETY: Data type of arrays matches because they are chunks from the same array.
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
}

/// Ignore the list indices and apply `func` to the inner type as [`Series`].
Expand Down
15 changes: 4 additions & 11 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,10 @@ impl ListChunked {

/// Get the inner values as [`Series`], ignoring the list offsets.
pub fn get_inner(&self) -> Series {
let ca = self.rechunk();
let arr = ca.downcast_iter().next().unwrap();
// SAFETY:
// Inner dtype is passed correctly
unsafe {
Series::from_chunks_and_dtype_unchecked(
self.name(),
vec![arr.values().clone()],
&ca.inner_dtype(),
)
}
let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();

// SAFETY: Data type of arrays matches because they are chunks from the same array.
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
}

/// Ignore the list indices and apply `func` to the inner type as [`Series`].
Expand Down
11 changes: 1 addition & 10 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4460,7 +4460,7 @@ def to_numpy(
if (
use_pyarrow
and _PYARROW_AVAILABLE
and self.dtype not in (Object, Datetime, Duration, Date, Array)
and self.dtype not in (Date, Datetime, Duration, Array, Object)
):
if not allow_copy and self.n_chunks() > 1 and not self.is_empty():
msg = "cannot return a zero-copy array"
Expand All @@ -4470,15 +4470,6 @@ def to_numpy(
zero_copy_only=not allow_copy, writable=writable
)

if self.dtype == Array:
np_array = self.explode().to_numpy(
allow_copy=allow_copy,
writable=writable,
use_pyarrow=use_pyarrow,
)
np_array.shape = (self.len(), self.dtype.width) # type: ignore[attr-defined]
return np_array

return self._s.to_numpy(allow_copy=allow_copy, writable=writable)

def to_torch(self) -> torch.Tensor:
Expand Down
36 changes: 25 additions & 11 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use pyo3::types::PyList;
use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
use crate::error::PyPolarsErr;
use crate::prelude::*;
use crate::to_numpy::{reshape_numpy_array, series_to_numpy_view};
use crate::{arrow_interop, raise_err, PySeries};

#[pymethods]
Expand Down Expand Up @@ -169,20 +170,20 @@ impl PySeries {
fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult<PyObject> {
if self.series.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy for empty Series.
// This does not actually copy data for empty Series.
return series_to_numpy_with_copy(py, &self.series);
} else if self.series.null_count() == 0 {
if let Some(mut arr) = self.to_numpy_view(py) {
if writable {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
}

if let Some(mut arr) = series_to_numpy_view(py, &self.series, false) {
if writable {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
return Ok(arr);
arr = arr.call_method0(py, intern!(py, "copy"))?;
}
return Ok(arr);
}

if !allow_copy {
Expand Down Expand Up @@ -264,6 +265,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py));
PyArray1::from_iter_bound(py, values).into_py(py)
},
Array(_, _) => array_series_to_numpy(py, s),
#[cfg(feature = "object")]
Object(_, _) => {
let ca = s
Expand Down Expand Up @@ -352,3 +354,15 @@ where
let values = ca.iter().map(|v| v.unwrap_or(i64::MIN).into());
PyArray1::<T>::from_iter_bound(py, values).into_py(py)
}
/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
fn array_series_to_numpy(py: Python, s: &Series) -> PyObject {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap();

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
unreachable!()
};
reshape_numpy_array(py, np_array_flat, ca.len(), *width)
}
147 changes: 98 additions & 49 deletions py-polars/src/to_numpy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ use numpy::{
use polars_core::prelude::*;
use polars_core::utils::try_get_supertype;
use polars_core::with_match_physical_numeric_polars_type;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

use crate::conversion::Wrap;
use crate::dataframe::PyDataFrame;
Expand Down Expand Up @@ -58,59 +60,66 @@ impl PySeries {
/// appropriately.
#[allow(clippy::wrong_self_convention)]
pub fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
// NumPy arrays are always contiguous
if self.series.n_chunks() > 1 {
return None;
}
series_to_numpy_view(py, &self.series, true)
}
}

match self.series.dtype() {
dt if dt.is_numeric() => {
let dims = [self.series.len()].into_dimension();
let owner = self.clone().into_py(py); // Keep the Series memory alive.
with_match_physical_numeric_polars_type!(dt, |$T| {
let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
let ca: &ChunkedArray<$T> = self.series.unpack::<$T>().unwrap();
let slice = ca.data_views().next().unwrap();

let view = unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
slice.as_ptr() as _,
owner,
)
};
Some(view)
})
},
dt @ (DataType::Datetime(_, _) | DataType::Duration(_)) => {
let np_dtype = polars_dtype_to_np_temporal_dtype(py, dt);

let phys = self.series.to_physical_repr();
let ca = phys.i64().unwrap();
let slice = ca.data_views().next().unwrap();
let dims = [self.series.len()].into_dimension();
let owner = self.clone().into_py(py);

let view = unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
slice.as_ptr() as _,
owner,
)
};
Some(view)
},
_ => None,
}
pub(crate) fn series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
// NumPy arrays are always contiguous
if s.n_chunks() > 1 {
return None;
}
if !allow_nulls && s.null_count() > 0 {
return None;
}
let view = match s.dtype() {
dt if dt.is_numeric() => numeric_series_to_numpy_view(py, s),
DataType::Datetime(_, _) | DataType::Duration(_) => temporal_series_to_numpy_view(py, s),
DataType::Array(_, _) => array_series_to_numpy_view(py, s, allow_nulls)?,
_ => return None,
};
Some(view)
}
fn numeric_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
let dims = [s.len()].into_dimension();
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
let ca: &ChunkedArray<$T> = s.unpack::<$T>().unwrap();
let slice = ca.data_views().next().unwrap();

unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
slice.as_ptr() as _,
owner,
)
}
})
}
fn temporal_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());

let phys = s.to_physical_repr();
let ca = phys.i64().unwrap();
let slice = ca.data_views().next().unwrap();
let dims = [s.len()].into_dimension();
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.

unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
slice.as_ptr() as _,
owner,
)
}
}
/// Get the NumPy temporal data type associated with the given Polars [`DataType`].
fn polars_dtype_to_np_temporal_dtype<'a>(
py: Python<'a>,
Expand Down Expand Up @@ -139,6 +148,46 @@ fn polars_dtype_to_np_temporal_dtype<'a>(
_ => panic!("only Datetime/Duration inputs supported, got {}", dtype),
}
}
fn array_series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_view(py, &s_inner, allow_nulls)?;

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
unreachable!()
};
let view = reshape_numpy_array(py, np_array_flat, ca.len(), *width);
Some(view)
}
/// Reshape the first dimension of a NumPy array to the given height and width.
pub(crate) fn reshape_numpy_array(
py: Python,
arr: PyObject,
height: usize,
width: usize,
) -> PyObject {
let shape = arr
.getattr(py, intern!(py, "shape"))
.unwrap()
.extract::<Vec<usize>>(py)
.unwrap();

if shape.len() == 1 {
// In this case we can avoid allocating a Vec.
let new_shape = (height, width);
arr.call_method1(py, intern!(py, "reshape"), new_shape)
.unwrap()
} else {
let mut new_shape_vec = vec![height, width];
for v in &shape[1..] {
new_shape_vec.push(*v)
}
let new_shape = PyTuple::new_bound(py, new_shape_vec);
arr.call_method1(py, intern!(py, "reshape"), new_shape)
.unwrap()
}
}

#[pymethods]
#[allow(clippy::wrong_self_convention)]
Expand Down
35 changes: 28 additions & 7 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,13 +223,14 @@ def test_series_to_numpy_bool_with_nulls() -> None:


def test_series_to_numpy_array_of_int() -> None:
values = [[1, 2], [3, 4], [5, 6]]
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
result = s.to_numpy(use_pyarrow=False)
values = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int8, 3), 2))
result = s.to_numpy(use_pyarrow=False, allow_copy=False)

expected = np.array(values)
assert_array_equal(result, expected)
assert result.dtype == np.int64
assert result.dtype == np.int8
assert result.shape == (2, 2, 3)


def test_series_to_numpy_array_of_str() -> None:
Expand All @@ -240,9 +241,6 @@ def test_series_to_numpy_array_of_str() -> None:
assert result.dtype == np.object_


@pytest.mark.skip(
reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268"
)
def test_series_to_numpy_array_with_nulls() -> None:
values = [[1, 2], [3, 4], None]
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
Expand All @@ -254,6 +252,29 @@ def test_series_to_numpy_array_with_nulls() -> None:
assert_allow_copy_false_raises(s)


def test_series_to_numpy_array_with_nested_nulls() -> None:
values = [[None, 2], [3, 4], [5, None]]
s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
result = s.to_numpy(use_pyarrow=False)

expected = np.array([[np.nan, 2.0], [3.0, 4.0], [5.0, np.nan]])
assert_array_equal(result, expected)
assert result.dtype == np.float64
assert_allow_copy_false_raises(s)


def test_series_to_numpy_array_of_arrays() -> None:
values = [[[None, 2], [3, 4]], [None, [7, 8]]]
s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int64, 2), 2))
result = s.to_numpy(use_pyarrow=False)

expected = np.array([[[np.nan, 2], [3, 4]], [[np.nan, np.nan], [7, 8]]])
assert_array_equal(result, expected)
assert result.dtype == np.float64
assert result.shape == (2, 2, 2)
assert_allow_copy_false_raises(s)


def test_to_numpy_null() -> None:
s = pl.Series([None, None], dtype=pl.Null)
result = s.to_numpy(use_pyarrow=False)
Expand Down

0 comments on commit 11fe9d8

Please sign in to comment.