From 11fe9d8f223d50b5e3284b924bb969049b0be6c6 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Thu, 16 May 2024 15:05:14 +0200
Subject: [PATCH] fix(python): Fix `Series.to_numpy` for Array types with nulls
 and nested Arrays (#16230)

---
 .../src/chunked_array/array/mod.rs            |  16 +-
 .../polars-core/src/chunked_array/list/mod.rs |  15 +-
 py-polars/polars/series/series.py             |  11 +-
 py-polars/src/series/export.rs                |  36 +++--
 py-polars/src/to_numpy.rs                     | 147 ++++++++++++------
 .../interop/numpy/test_to_numpy_series.py     |  35 ++++-
 6 files changed, 160 insertions(+), 100 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs
index 15fe892d3404..b92255a8f995 100644
--- a/crates/polars-core/src/chunked_array/array/mod.rs
+++ b/crates/polars-core/src/chunked_array/array/mod.rs
@@ -31,18 +31,10 @@ impl ArrayChunked {
 
     /// Get the inner values as `Series`
     pub fn get_inner(&self) -> Series {
-        let ca = self.rechunk();
-        let field = self.inner_dtype().to_arrow_field("item", true);
-        let arr = ca.downcast_iter().next().unwrap();
-        unsafe {
-            Series::_try_from_arrow_unchecked_with_md(
-                self.name(),
-                vec![(arr.values()).clone()],
-                &field.data_type,
-                Some(&field.metadata),
-            )
-            .unwrap()
-        }
+        let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
+
+        // SAFETY: Data type of arrays matches because they are chunks from the same array.
+        unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
     }
 
     /// Ignore the list indices and apply `func` to the inner type as [`Series`].
diff --git a/crates/polars-core/src/chunked_array/list/mod.rs b/crates/polars-core/src/chunked_array/list/mod.rs
index eba65e8980ab..730b10d4a162 100644
--- a/crates/polars-core/src/chunked_array/list/mod.rs
+++ b/crates/polars-core/src/chunked_array/list/mod.rs
@@ -41,17 +41,10 @@ impl ListChunked {
 
     /// Get the inner values as [`Series`], ignoring the list offsets.
     pub fn get_inner(&self) -> Series {
-        let ca = self.rechunk();
-        let arr = ca.downcast_iter().next().unwrap();
-        // SAFETY:
-        // Inner dtype is passed correctly
-        unsafe {
-            Series::from_chunks_and_dtype_unchecked(
-                self.name(),
-                vec![arr.values().clone()],
-                &ca.inner_dtype(),
-            )
-        }
+        let chunks: Vec<_> = self.downcast_iter().map(|c| c.values().clone()).collect();
+
+        // SAFETY: Data type of arrays matches because they are chunks from the same array.
+        unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
     }
 
     /// Ignore the list indices and apply `func` to the inner type as [`Series`].
diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
index 0ba9239148d5..9d1a939dc3ac 100644
--- a/py-polars/polars/series/series.py
+++ b/py-polars/polars/series/series.py
@@ -4460,7 +4460,7 @@ def to_numpy(
         if (
             use_pyarrow
             and _PYARROW_AVAILABLE
-            and self.dtype not in (Object, Datetime, Duration, Date, Array)
+            and self.dtype not in (Date, Datetime, Duration, Array, Object)
         ):
             if not allow_copy and self.n_chunks() > 1 and not self.is_empty():
                 msg = "cannot return a zero-copy array"
@@ -4470,15 +4470,6 @@ def to_numpy(
                 zero_copy_only=not allow_copy, writable=writable
             )
 
-        if self.dtype == Array:
-            np_array = self.explode().to_numpy(
-                allow_copy=allow_copy,
-                writable=writable,
-                use_pyarrow=use_pyarrow,
-            )
-            np_array.shape = (self.len(), self.dtype.width)  # type: ignore[attr-defined]
-            return np_array
-
         return self._s.to_numpy(allow_copy=allow_copy, writable=writable)
 
     def to_torch(self) -> torch.Tensor:
diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs
index 8af513cc718b..4903b3df093e 100644
--- a/py-polars/src/series/export.rs
+++ b/py-polars/src/series/export.rs
@@ -9,6 +9,7 @@ use pyo3::types::PyList;
 use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
 use crate::error::PyPolarsErr;
 use crate::prelude::*;
+use crate::to_numpy::{reshape_numpy_array, series_to_numpy_view};
 use crate::{arrow_interop, raise_err, PySeries};
 
 #[pymethods]
@@ -169,20 +170,20 @@ impl PySeries {
     fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult<PyObject> {
         if self.series.is_empty() {
             // Take this path to ensure a writable array.
-            // This does not actually copy for empty Series.
+            // This does not actually copy data for empty Series.
             return series_to_numpy_with_copy(py, &self.series);
-        } else if self.series.null_count() == 0 {
-            if let Some(mut arr) = self.to_numpy_view(py) {
-                if writable {
-                    if !allow_copy {
-                        return Err(PyValueError::new_err(
-                            "cannot return a zero-copy writable array",
-                        ));
-                    }
-                    arr = arr.call_method0(py, intern!(py, "copy"))?;
+        }
+
+        if let Some(mut arr) = series_to_numpy_view(py, &self.series, false) {
+            if writable {
+                if !allow_copy {
+                    return Err(PyValueError::new_err(
+                        "cannot return a zero-copy writable array",
+                    ));
                 }
-                return Ok(arr);
+                arr = arr.call_method0(py, intern!(py, "copy"))?;
             }
+            return Ok(arr);
         }
 
         if !allow_copy {
@@ -264,6 +265,7 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
             let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py));
             PyArray1::from_iter_bound(py, values).into_py(py)
         },
+        Array(_, _) => array_series_to_numpy(py, s),
         #[cfg(feature = "object")]
         Object(_, _) => {
             let ca = s
@@ -352,3 +354,15 @@ where
     let values = ca.iter().map(|v| v.unwrap_or(i64::MIN).into());
     PyArray1::<T>::from_iter_bound(py, values).into_py(py)
 }
+/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
+fn array_series_to_numpy(py: Python, s: &Series) -> PyObject {
+    let ca = s.array().unwrap();
+    let s_inner = ca.get_inner();
+    let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap();
+
+    // Reshape to the original shape.
+    let DataType::Array(_, width) = s.dtype() else {
+        unreachable!()
+    };
+    reshape_numpy_array(py, np_array_flat, ca.len(), *width)
+}
diff --git a/py-polars/src/to_numpy.rs b/py-polars/src/to_numpy.rs
index f17741376216..6c086b0f24c5 100644
--- a/py-polars/src/to_numpy.rs
+++ b/py-polars/src/to_numpy.rs
@@ -8,7 +8,9 @@ use numpy::{
 use polars_core::prelude::*;
 use polars_core::utils::try_get_supertype;
 use polars_core::with_match_physical_numeric_polars_type;
+use pyo3::intern;
 use pyo3::prelude::*;
+use pyo3::types::PyTuple;
 
 use crate::conversion::Wrap;
 use crate::dataframe::PyDataFrame;
@@ -58,59 +60,66 @@ impl PySeries {
     /// appropriately.
     #[allow(clippy::wrong_self_convention)]
     pub fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
-        // NumPy arrays are always contiguous
-        if self.series.n_chunks() > 1 {
-            return None;
-        }
+        series_to_numpy_view(py, &self.series, true)
+    }
+}
 
-        match self.series.dtype() {
-            dt if dt.is_numeric() => {
-                let dims = [self.series.len()].into_dimension();
-                let owner = self.clone().into_py(py); // Keep the Series memory alive.
-                with_match_physical_numeric_polars_type!(dt, |$T| {
-                    let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
-                    let ca: &ChunkedArray<$T> = self.series.unpack::<$T>().unwrap();
-                    let slice = ca.data_views().next().unwrap();
-
-                    let view = unsafe {
-                        create_borrowed_np_array::<_>(
-                            py,
-                            np_dtype,
-                            dims,
-                            flags::NPY_ARRAY_FARRAY_RO,
-                            slice.as_ptr() as _,
-                            owner,
-                        )
-                    };
-                    Some(view)
-                })
-            },
-            dt @ (DataType::Datetime(_, _) | DataType::Duration(_)) => {
-                let np_dtype = polars_dtype_to_np_temporal_dtype(py, dt);
-
-                let phys = self.series.to_physical_repr();
-                let ca = phys.i64().unwrap();
-                let slice = ca.data_views().next().unwrap();
-                let dims = [self.series.len()].into_dimension();
-                let owner = self.clone().into_py(py);
-
-                let view = unsafe {
-                    create_borrowed_np_array::<_>(
-                        py,
-                        np_dtype,
-                        dims,
-                        flags::NPY_ARRAY_FARRAY_RO,
-                        slice.as_ptr() as _,
-                        owner,
-                    )
-                };
-                Some(view)
-            },
-            _ => None,
-        }
+pub(crate) fn series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
+    // NumPy arrays are always contiguous
+    if s.n_chunks() > 1 {
+        return None;
+    }
+    if !allow_nulls && s.null_count() > 0 {
+        return None;
     }
+    let view = match s.dtype() {
+        dt if dt.is_numeric() => numeric_series_to_numpy_view(py, s),
+        DataType::Datetime(_, _) | DataType::Duration(_) => temporal_series_to_numpy_view(py, s),
+        DataType::Array(_, _) => array_series_to_numpy_view(py, s, allow_nulls)?,
+        _ => return None,
+    };
+    Some(view)
+}
+fn numeric_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
+    let dims = [s.len()].into_dimension();
+    let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
+    with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
+        let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
+        let ca: &ChunkedArray<$T> = s.unpack::<$T>().unwrap();
+        let slice = ca.data_views().next().unwrap();
+
+        unsafe {
+            create_borrowed_np_array::<_>(
+                py,
+                np_dtype,
+                dims,
+                flags::NPY_ARRAY_FARRAY_RO,
+                slice.as_ptr() as _,
+                owner,
+            )
+        }
+    })
 }
+fn temporal_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
+    let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
+
+    let phys = s.to_physical_repr();
+    let ca = phys.i64().unwrap();
+    let slice = ca.data_views().next().unwrap();
+    let dims = [s.len()].into_dimension();
+    let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
 
+    unsafe {
+        create_borrowed_np_array::<_>(
+            py,
+            np_dtype,
+            dims,
+            flags::NPY_ARRAY_FARRAY_RO,
+            slice.as_ptr() as _,
+            owner,
+        )
+    }
+}
 /// Get the NumPy temporal data type associated with the given Polars [`DataType`].
 fn polars_dtype_to_np_temporal_dtype<'a>(
     py: Python<'a>,
@@ -139,6 +148,46 @@ fn polars_dtype_to_np_temporal_dtype<'a>(
         _ => panic!("only Datetime/Duration inputs supported, got {}", dtype),
     }
 }
+fn array_series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
+    let ca = s.array().unwrap();
+    let s_inner = ca.get_inner();
+    let np_array_flat = series_to_numpy_view(py, &s_inner, allow_nulls)?;
+
+    // Reshape to the original shape.
+    let DataType::Array(_, width) = s.dtype() else {
+        unreachable!()
+    };
+    let view = reshape_numpy_array(py, np_array_flat, ca.len(), *width);
+    Some(view)
+}
+/// Reshape the first dimension of a NumPy array to the given height and width.
+pub(crate) fn reshape_numpy_array(
+    py: Python,
+    arr: PyObject,
+    height: usize,
+    width: usize,
+) -> PyObject {
+    let shape = arr
+        .getattr(py, intern!(py, "shape"))
+        .unwrap()
+        .extract::<Vec<usize>>(py)
+        .unwrap();
+
+    if shape.len() == 1 {
+        // In this case we can avoid allocating a Vec.
+        let new_shape = (height, width);
+        arr.call_method1(py, intern!(py, "reshape"), new_shape)
+            .unwrap()
+    } else {
+        let mut new_shape_vec = vec![height, width];
+        for v in &shape[1..] {
+            new_shape_vec.push(*v)
+        }
+        let new_shape = PyTuple::new_bound(py, new_shape_vec);
+        arr.call_method1(py, intern!(py, "reshape"), new_shape)
+            .unwrap()
+    }
+}
 
 #[pymethods]
 #[allow(clippy::wrong_self_convention)]
diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
index b559918ae7d0..3ecd0cf4963a 100644
--- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
+++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
@@ -223,13 +223,14 @@ def test_series_to_numpy_bool_with_nulls() -> None:
 
 
 def test_series_to_numpy_array_of_int() -> None:
-    values = [[1, 2], [3, 4], [5, 6]]
-    s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
-    result = s.to_numpy(use_pyarrow=False)
+    values = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
+    s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int8, 3), 2))
+    result = s.to_numpy(use_pyarrow=False, allow_copy=False)
 
     expected = np.array(values)
     assert_array_equal(result, expected)
-    assert result.dtype == np.int64
+    assert result.dtype == np.int8
+    assert result.shape == (2, 2, 3)
 
 
 def test_series_to_numpy_array_of_str() -> None:
@@ -240,9 +241,6 @@ def test_series_to_numpy_array_of_str() -> None:
     assert result.dtype == np.object_
 
 
-@pytest.mark.skip(
-    reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268"
-)
 def test_series_to_numpy_array_with_nulls() -> None:
     values = [[1, 2], [3, 4], None]
     s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
@@ -254,6 +252,29 @@ def test_series_to_numpy_array_with_nulls() -> None:
     assert_allow_copy_false_raises(s)
 
 
+def test_series_to_numpy_array_with_nested_nulls() -> None:
+    values = [[None, 2], [3, 4], [5, None]]
+    s = pl.Series(values, dtype=pl.Array(pl.Int64, 2))
+    result = s.to_numpy(use_pyarrow=False)
+
+    expected = np.array([[np.nan, 2.0], [3.0, 4.0], [5.0, np.nan]])
+    assert_array_equal(result, expected)
+    assert result.dtype == np.float64
+    assert_allow_copy_false_raises(s)
+
+
+def test_series_to_numpy_array_of_arrays() -> None:
+    values = [[[None, 2], [3, 4]], [None, [7, 8]]]
+    s = pl.Series(values, dtype=pl.Array(pl.Array(pl.Int64, 2), 2))
+    result = s.to_numpy(use_pyarrow=False)
+
+    expected = np.array([[[np.nan, 2], [3, 4]], [[np.nan, np.nan], [7, 8]]])
+    assert_array_equal(result, expected)
+    assert result.dtype == np.float64
+    assert result.shape == (2, 2, 2)
+    assert_allow_copy_false_raises(s)
+
+
 def test_to_numpy_null() -> None:
     s = pl.Series([None, None], dtype=pl.Null)
     result = s.to_numpy(use_pyarrow=False)