Skip to content

Commit

Permalink
feat(python): Support List types in Series.to_numpy (#16315)
Browse files Browse the repository at this point in the history
Co-authored-by: ritchie <[email protected]>
  • Loading branch information
stinodego and ritchie46 authored May 20, 2024
1 parent 129c951 commit 1b58f78
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 39 deletions.
48 changes: 48 additions & 0 deletions crates/polars-core/src/chunked_array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,25 @@ impl ListChunked {
unsafe { Series::from_chunks_and_dtype_unchecked(self.name(), chunks, &self.inner_dtype()) }
}

/// Returns an iterator over the offsets of this chunked array.
///
/// The offsets are returned as though the array consisted of a single chunk.
/// Returns an iterator over the offsets of this chunked array.
///
/// The offsets are returned as though the array consisted of a single chunk.
pub fn iter_offsets(&self) -> impl Iterator<Item = i64> + '_ {
    let mut per_chunk_offsets = self.downcast_iter().map(|arr| arr.offsets().iter());
    let first_chunk = per_chunk_offsets.next().unwrap();

    // A sliced array need not start at offset 0, so shift every emitted
    // offset by the first one to make the sequence start at 0.
    let start = *first_chunk.clone().next().unwrap();

    OffsetsIterator {
        current_offsets_iter: first_chunk,
        current_adjusted_offset: 0,
        offset_adjustment: -start,
        offsets_iters: per_chunk_offsets,
    }
}

/// Ignore the list indices and apply `func` to the inner type as [`Series`].
pub fn apply_to_inner(
&self,
Expand Down Expand Up @@ -93,3 +112,32 @@ impl ListChunked {
})
}
}

/// Iterator yielding list offsets as though the chunked array were a single
/// contiguous chunk: each chunk's offsets are shifted so they continue where
/// the previous chunk ended.
pub struct OffsetsIterator<'a, N>
where
    N: Iterator<Item = std::slice::Iter<'a, i64>>,
{
    /// Offset iterators for the chunks not yet visited.
    offsets_iters: N,
    /// Offset iterator of the chunk currently being consumed.
    current_offsets_iter: std::slice::Iter<'a, i64>,
    /// Last offset that was emitted (after adjustment).
    current_adjusted_offset: i64,
    /// Shift applied to the current chunk's raw offsets.
    offset_adjustment: i64,
}

impl<'a, N> Iterator for OffsetsIterator<'a, N>
where
    N: Iterator<Item = std::slice::Iter<'a, i64>>,
{
    type Item = i64;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current_offsets_iter.next() {
                Some(&raw) => {
                    self.current_adjusted_offset = raw + self.offset_adjustment;
                    return Some(self.current_adjusted_offset);
                },
                None => {
                    // Current chunk exhausted; move on to the next one (or stop).
                    self.current_offsets_iter = self.offsets_iters.next()?;
                    // The first offset of the new chunk corresponds to the
                    // position we already emitted, so consume it and recompute
                    // the shift that keeps the sequence contiguous.
                    let &first = self.current_offsets_iter.next().unwrap();
                    self.offset_adjustment = self.current_adjusted_offset - first;
                },
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
use arrow::compute::take::take_unchecked;
use arrow::offset::OffsetsBuffer;
use polars_utils::vec::PushUnchecked;

use super::*;
Expand All @@ -15,9 +16,10 @@ impl ChunkExplode for ListChunked {
}

fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
// A list array's memory layout is actually already 'exploded', so we can just take the values array
// of the list. And we also return a slice of the offsets. This slice can be used to find the old
// list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation
// A list array's memory layout is actually already 'exploded', so we can just take the
// values array of the list. And we also return a slice of the offsets. This slice can be
// used to find the old list layout or indexes to expand a DataFrame in the same manner as
// the `explode` operation.
let ca = self.rechunk();
let listarr: &LargeListArray = ca.downcast_iter().next().unwrap();
let offsets_buf = listarr.offsets().clone();
Expand Down
12 changes: 6 additions & 6 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4405,8 +4405,8 @@ def is_between(
def to_numpy(
self,
*,
allow_copy: bool = True,
writable: bool = False,
allow_copy: bool = True,
use_pyarrow: bool = True,
zero_copy_only: bool | None = None,
) -> np.ndarray[Any, Any]:
Expand All @@ -4423,13 +4423,13 @@ def to_numpy(
Parameters
----------
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
writable
Ensure the resulting array is writable. This will force a copy of the data
if the array was created without copy, as the underlying Arrow data is
if the array was created without copy as the underlying Arrow data is
immutable.
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
use_pyarrow
First convert to PyArrow, then call `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
Expand Down Expand Up @@ -4474,7 +4474,7 @@ def to_numpy(
zero_copy_only=not allow_copy, writable=writable
)

return self._s.to_numpy(allow_copy=allow_copy, writable=writable)
return self._s.to_numpy(writable=writable, allow_copy=allow_copy)

@unstable()
def to_jax(self, device: jax.Device | str | None = None) -> jax.Array:
Expand Down
78 changes: 51 additions & 27 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use polars_core::prelude::*;
use pyo3::exceptions::PyValueError;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PyList;
use pyo3::types::{PyList, PySlice};

use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
use crate::error::PyPolarsErr;
Expand Down Expand Up @@ -167,39 +167,41 @@ impl PySeries {
/// This method copies data only when necessary. Set `allow_copy` to raise an error if copy
/// is required. Set `writable` to make sure the resulting array is writable, possibly requiring
/// copying the data.
fn to_numpy(&self, py: Python, allow_copy: bool, writable: bool) -> PyResult<PyObject> {
if self.series.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy data for empty Series.
return series_to_numpy_with_copy(py, &self.series);
}
fn to_numpy(&self, py: Python, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
series_to_numpy(py, &self.series, writable, allow_copy)
}
}

if let Some((mut arr, writable_flag)) =
try_series_to_numpy_view(py, &self.series, false, allow_copy)
{
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
/// Convert a Series to a NumPy ndarray.
fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
if s.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy data for empty Series.
return series_to_numpy_with_copy(py, s, true);
}
if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) {
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
));
}
return Ok(arr);
}

if !allow_copy {
return Err(PyValueError::new_err("cannot return a zero-copy array"));
arr = arr.call_method0(py, intern!(py, "copy"))?;
}
return Ok(arr);
}

series_to_numpy_with_copy(py, &self.series)
if !allow_copy {
return Err(PyValueError::new_err("cannot return a zero-copy array"));
}

series_to_numpy_with_copy(py, s, writable)
}

/// Convert a Series to a NumPy ndarray, copying data in the process.
///
/// This method will cast integers to floats so that `null = np.nan`.
fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
fn series_to_numpy_with_copy(py: Python, s: &Series, writable: bool) -> PyResult<PyObject> {
use DataType::*;
let out = match s.dtype() {
Int8 => numeric_series_to_numpy::<Int8Type, f32>(py, s),
Expand Down Expand Up @@ -267,7 +269,8 @@ fn series_to_numpy_with_copy(py: Python, s: &Series) -> PyResult<PyObject> {
let values = decimal_to_pyobject_iter(py, ca).map(|v| v.into_py(py));
PyArray1::from_iter_bound(py, values).into_py(py)
},
Array(_, _) => array_series_to_numpy(py, s),
List(_) => list_series_to_numpy(py, s, writable),
Array(_, _) => array_series_to_numpy(py, s, writable),
#[cfg(feature = "object")]
Object(_, _) => {
let ca = s
Expand Down Expand Up @@ -357,14 +360,35 @@ where
PyArray1::<T>::from_iter_bound(py, values).into_py(py)
}
/// Convert arrays by flattening first, converting the flat Series, and then reshaping.
fn array_series_to_numpy(py: Python, s: &Series) -> PyObject {
fn array_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner).unwrap();
let np_array_flat = series_to_numpy_with_copy(py, &s_inner, writable).unwrap();

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
unreachable!()
};
reshape_numpy_array(py, np_array_flat, ca.len(), *width)
}
/// Convert lists by flattening first, converting the flat Series, and then splitting by offsets.
///
/// Returns a 1-D NumPy object array whose elements are the per-row subarrays.
fn list_series_to_numpy(py: Python, s: &Series, writable: bool) -> PyObject {
    let ca = s.list().unwrap();
    let s_inner = ca.get_inner();

    // NOTE(review): `allow_copy` is hard-coded to `true` for the inner
    // conversion — TODO confirm callers never require zero-copy enforcement here.
    let np_array_flat = series_to_numpy(py, &s_inner, writable, true).unwrap();

    // Split the NumPy array into subarrays by offset.
    // TODO: Downcast the NumPy array to Rust and split without calling into Python.
    let mut offsets = ca.iter_offsets().map(|o| isize::try_from(o).unwrap());
    let mut prev_offset = offsets.next().unwrap();
    let values = offsets.map(|current_offset| {
        // Each [prev_offset, current_offset) slice is one list element.
        let slice = PySlice::new_bound(py, prev_offset, current_offset, 1);
        prev_offset = current_offset;
        np_array_flat
            .call_method1(py, "__getitem__", (slice,))
            .unwrap()
    });

    PyArray1::from_iter_bound(py, values).into_py(py)
}
19 changes: 16 additions & 3 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,6 @@ def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:
(pl.Binary, [b"a", b"bc", b"def"]),
(pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]),
(pl.Object, [Path(), Path("abc")]),
# TODO: Implement for List types
# (pl.List, [[1], [2, 3]]),
# (pl.List, [["a"], ["b", "c"], []]),
],
)
@pytest.mark.parametrize("with_nulls", [False, True])
Expand Down Expand Up @@ -275,6 +272,22 @@ def test_series_to_numpy_array_of_arrays() -> None:
assert_allow_copy_false_raises(s)


@pytest.mark.parametrize("chunked", [True, False])
def test_series_to_numpy_list(chunked: bool) -> None:
    """A List Series converts to an object ndarray of per-row subarrays."""
    values = [[1, 2], [3, 4, 5], [6], []]
    s = pl.Series(values)
    if chunked:
        s = pl.concat([s[:2], s[2:]])

    result = s.to_numpy(use_pyarrow=False)

    expected = np.array([np.array(v, dtype=np.int64) for v in values], dtype=np.object_)
    assert result.dtype == expected.dtype
    for res, exp in zip(result, expected):
        assert_array_equal(res, exp)
        # Chunked input forces a copy of the flat data, making rows writable.
        assert res.flags.writeable == chunked
    assert_allow_copy_false_raises(s)


def test_to_numpy_null() -> None:
s = pl.Series([None, None], dtype=pl.Null)
result = s.to_numpy(use_pyarrow=False)
Expand Down

0 comments on commit 1b58f78

Please sign in to comment.