pola-rs · ritchie46 · May 16, 2024 · May 13, 2024 · May 14, 2024 · May 14, 2024
@@ -99,7 +99,7 @@
 from polars.dependencies import numpy as np
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
-from polars.exceptions import ModuleUpgradeRequired, ShapeError
+from polars.exceptions import ComputeError, ModuleUpgradeRequired, ShapeError
 from polars.meta import get_index_type
 from polars.series.array import ArrayNameSpace
 from polars.series.binary import BinaryNameSpace
@@ -1295,7 +1295,7 @@ def __getitem__(
 
     def __getitem__(
         self,
-        item: (int | Series | range | slice | np.ndarray[Any, Any] | list[int]),
+        item: int | Series | range | slice | np.ndarray[Any, Any] | list[int],
     ) -> Any:
         if isinstance(item, Series) and item.dtype.is_integer():
             return self._take_with_series(item._pos_idxs(self.len()))
@@ -1404,13 +1404,10 @@ def __array_ufunc__(
                 raise NotImplementedError(msg)
 
             args: list[int | float | np.ndarray[Any, Any]] = []
-
-            validity_mask = self.is_not_null()
             for arg in inputs:
                 if isinstance(arg, (int, float, np.ndarray)):
                     args.append(arg)
                 elif isinstance(arg, Series):
-                    validity_mask &= arg.is_not_null()
                     args.append(arg.to_physical()._s.to_numpy_view())
                 else:
                     msg = f"unsupported type {type(arg).__name__!r} for {arg!r}"
@@ -1443,6 +1440,15 @@ def __array_ufunc__(
                 else dtype_char_minimum
             )
 
+            # Only generalized ufuncs have a signature set:
+            is_generalized_ufunc = bool(ufunc.signature)
+            if is_generalized_ufunc:
+                # Generalized ufuncs will operate on the whole array, so
+                # missing data can corrupt the results.
+                if self.null_count() > 0:
+                    msg = "Can't pass a Series with missing data to a generalized ufunc, as it might give unexpected results. See https://docs.pola.rs/user-guide/expressions/missing-data/ for suggestions on how to remove or fill in missing data."
+                    raise ComputeError(msg)
+
             f = get_ffi_func("apply_ufunc_<>", numpy_char_code_to_dtype(dtype_char), s)
 
             if f is None:
@@ -1453,12 +1459,24 @@ def __array_ufunc__(
                 raise NotImplementedError(msg)
 
             series = f(lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs))
+            result = self._from_pyseries(series)
+            if is_generalized_ufunc:
+                # In this case we've disallowed passing in missing data, so no
+                # further processing is needed.
+                return result
+
+            # We're using a regular ufunc, that operates value by value. That
+            # means we allowed missing data in the input, so filter it out:
+            validity_mask = self.is_not_null()
+            for arg in inputs:
+                if isinstance(arg, Series):
+                    validity_mask &= arg.is_not_null()
             return (
-                self._from_pyseries(series)
-                .to_frame()
+                result.to_frame()
                 .select(F.when(validity_mask).then(F.col(self.name)))
                 .to_series(0)
             )
+
         else:
             msg = (
                 "only `__call__` is implemented for numpy ufuncs on a Series, got "
@@ -4143,7 +4161,7 @@ def equals(
 
     def cast(
         self,
-        dtype: (PolarsDataType | type[int] | type[float] | type[str] | type[bool]),
+        dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool],
         *,
         strict: bool = True,
     ) -> Self:

@@ -17,6 +17,7 @@ pip
 
 # Interoperability
 numpy
+numba; python_version < '3.13'  # Numba can lag Python releases
 pandas
 pyarrow
 pydantic>=2.0.0

@@ -1,8 +1,9 @@
 from __future__ import annotations
 
-from typing import Any, cast
+from typing import Any, Callable, cast
 
 import numpy as np
+import pytest
 
 import polars as pl
 from polars.testing import assert_frame_equal, assert_series_equal
@@ -130,3 +131,31 @@ def test_ufunc_multiple_expressions() -> None:
 def test_grouped_ufunc() -> None:
     df = pl.DataFrame({"id": ["a", "a", "b", "b"], "values": [0.1, 0.1, -0.1, -0.1]})
     df.group_by("id").agg(pl.col("values").log1p().sum().pipe(np.expm1))
+
+
+def make_gufunc_mean() -> Callable[[pl.Series], pl.Series]:
+    numba = pytest.importorskip("numba")
+
+    @numba.guvectorize([(numba.float64[:], numba.float64[:])], "(n)->(n)")
+    def gufunc_mean(arr, result):  # type: ignore[no-untyped-def]
+        mean = arr.mean()
+        for i in range(len(arr)):
+            result[i] = mean + i
+
+    return gufunc_mean  # type: ignore[no-any-return]
+
+
+def test_generalized_ufunc() -> None:
+    gufunc_mean = make_gufunc_mean()
+    df = pl.DataFrame({"s": [1.0, 2.0, 3.0]})
+    result = df.select([pl.col("s").map_batches(gufunc_mean).alias("result")])
+    expected = pl.DataFrame({"result": [2.0, 3.0, 4.0]})
+    assert_frame_equal(result, expected)
+
+
+def test_grouped_generalized_ufunc() -> None:
+    gufunc_mean = make_gufunc_mean()
+    df = pl.DataFrame({"id": ["a", "a", "b", "b"], "values": [1.0, 2.0, 3.0, 4.0]})
+    result = df.group_by("id").agg(pl.col("values").map_batches(gufunc_mean)).sort("id")
+    expected = pl.DataFrame({"id": ["a", "b"], "values": [[1.5, 2.5], [3.5, 4.5]]})
+    assert_frame_equal(result, expected)
@@ -1,6 +1,7 @@
-from typing import cast
+from typing import Callable, cast
 
 import numpy as np
+import pytest
 from numpy.testing import assert_array_equal
 
 import polars as pl
@@ -119,3 +120,41 @@ def test_numpy_string_array() -> None:
         np.char.capitalize(s_str),
         np.array(["Aa", "Bb", "Cc", "Dd"], dtype="<U2"),
     )
+
+
+def make_add_one() -> Callable[[pl.Series], pl.Series]:
+    numba = pytest.importorskip("numba")
+
+    @numba.guvectorize([(numba.float64[:], numba.float64[:])], "(n)->(n)")
+    def add_one(arr, result):  # type: ignore[no-untyped-def]
+        for i in range(len(arr)):
+            result[i] = arr[i] + 1.0
+
+    return add_one  # type: ignore[no-any-return]
+
+
+def test_generalized_ufunc() -> None:
+    """A generalized ufunc can be called on a pl.Series."""
+    add_one = make_add_one()
+    s_float = pl.Series("f", [1.0, 2.0, 3.0])
+    result = add_one(s_float)
+    assert_series_equal(result, pl.Series("f", [2.0, 3.0, 4.0]))
+
+
+def test_generalized_ufunc_missing_data() -> None:
+    """
+    If a pl.Series is missing data, using a generalized ufunc is not allowed.
+
+    While this particular example isn't necessarily a semantic issue, consider
+    a mean() function running on integers: it will give wrong results if the
+    input is missing data, since NumPy has no way to model missing slots.  In
+    the general case, we can't assume the function will handle missing data
+    correctly.
+    """
+    add_one = make_add_one()
+    s_float = pl.Series("f", [1.0, 2.0, 3.0, None], dtype=pl.Float64)
+    with pytest.raises(
+        pl.ComputeError,
+        match="Can't pass a Series with missing data to a generalized ufunc",
+    ):
+        add_one(s_float)