diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index e45a9301c6f6..0ba9239148d5 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -99,7 +99,7 @@ from polars.dependencies import numpy as np from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa -from polars.exceptions import ModuleUpgradeRequired, ShapeError +from polars.exceptions import ComputeError, ModuleUpgradeRequired, ShapeError from polars.meta import get_index_type from polars.series.array import ArrayNameSpace from polars.series.binary import BinaryNameSpace @@ -1295,7 +1295,7 @@ def __getitem__( def __getitem__( self, - item: (int | Series | range | slice | np.ndarray[Any, Any] | list[int]), + item: int | Series | range | slice | np.ndarray[Any, Any] | list[int], ) -> Any: if isinstance(item, Series) and item.dtype.is_integer(): return self._take_with_series(item._pos_idxs(self.len())) @@ -1404,13 +1404,10 @@ def __array_ufunc__( raise NotImplementedError(msg) args: list[int | float | np.ndarray[Any, Any]] = [] - - validity_mask = self.is_not_null() for arg in inputs: if isinstance(arg, (int, float, np.ndarray)): args.append(arg) elif isinstance(arg, Series): - validity_mask &= arg.is_not_null() args.append(arg.to_physical()._s.to_numpy_view()) else: msg = f"unsupported type {type(arg).__name__!r} for {arg!r}" @@ -1443,6 +1440,15 @@ def __array_ufunc__( else dtype_char_minimum ) + # Only generalized ufuncs have a signature set: + is_generalized_ufunc = bool(ufunc.signature) + if is_generalized_ufunc: + # Generalized ufuncs will operate on the whole array, so + # missing data can corrupt the results. + if self.null_count() > 0: + msg = "Can't pass a Series with missing data to a generalized ufunc, as it might give unexpected results. See https://docs.pola.rs/user-guide/expressions/missing-data/ for suggestions on how to remove or fill in missing data." + raise ComputeError(msg) + f = get_ffi_func("apply_ufunc_<>", numpy_char_code_to_dtype(dtype_char), s) if f is None: @@ -1453,12 +1459,24 @@ def __array_ufunc__( raise NotImplementedError(msg) series = f(lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs)) + result = self._from_pyseries(series) + if is_generalized_ufunc: + # In this case we've disallowed passing in missing data, so no + # further processing is needed. + return result + + # We're using a regular ufunc, that operates value by value. That + # means we allowed missing data in the input, so filter it out: + validity_mask = self.is_not_null() + for arg in inputs: + if isinstance(arg, Series): + validity_mask &= arg.is_not_null() return ( - self._from_pyseries(series) - .to_frame() + result.to_frame() .select(F.when(validity_mask).then(F.col(self.name))) .to_series(0) ) + else: msg = ( "only `__call__` is implemented for numpy ufuncs on a Series, got " @@ -4143,7 +4161,7 @@ def equals( def cast( self, - dtype: (PolarsDataType | type[int] | type[float] | type[str] | type[bool]), + dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = True, ) -> Self: diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 70c366a73e51..4f8f1dbcee99 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -17,6 +17,7 @@ pip # Interoperability numpy +numba; python_version < '3.13' # Numba can lag Python releases pandas pyarrow pydantic>=2.0.0 diff --git a/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py b/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py index 8695d8d7e4b5..739076a4ac5c 100644 --- a/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py +++ b/py-polars/tests/unit/interop/numpy/test_ufunc_expr.py @@ -1,8 +1,9 @@ from __future__ import annotations -from typing import Any, cast +from typing import Any, Callable, cast import numpy as np +import pytest import polars as pl from polars.testing import assert_frame_equal, assert_series_equal @@ -130,3 +131,31 @@ def test_ufunc_multiple_expressions() -> None: def test_grouped_ufunc() -> None: df = pl.DataFrame({"id": ["a", "a", "b", "b"], "values": [0.1, 0.1, -0.1, -0.1]}) df.group_by("id").agg(pl.col("values").log1p().sum().pipe(np.expm1)) + + +def make_gufunc_mean() -> Callable[[pl.Series], pl.Series]: + numba = pytest.importorskip("numba") + + @numba.guvectorize([(numba.float64[:], numba.float64[:])], "(n)->(n)") + def gufunc_mean(arr, result): # type: ignore[no-untyped-def] + mean = arr.mean() + for i in range(len(arr)): + result[i] = mean + i + + return gufunc_mean # type: ignore[no-any-return] + + +def test_generalized_ufunc() -> None: + gufunc_mean = make_gufunc_mean() + df = pl.DataFrame({"s": [1.0, 2.0, 3.0]}) + result = df.select([pl.col("s").map_batches(gufunc_mean).alias("result")]) + expected = pl.DataFrame({"result": [2.0, 3.0, 4.0]}) + assert_frame_equal(result, expected) + + +def test_grouped_generalized_ufunc() -> None: + gufunc_mean = make_gufunc_mean() + df = pl.DataFrame({"id": ["a", "a", "b", "b"], "values": [1.0, 2.0, 3.0, 4.0]}) + result = df.group_by("id").agg(pl.col("values").map_batches(gufunc_mean)).sort("id") + expected = pl.DataFrame({"id": ["a", "b"], "values": [[1.5, 2.5], [3.5, 4.5]]}) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/interop/numpy/test_ufunc_series.py b/py-polars/tests/unit/interop/numpy/test_ufunc_series.py index 917b54c9eba2..9c8625e55f12 100644 --- a/py-polars/tests/unit/interop/numpy/test_ufunc_series.py +++ b/py-polars/tests/unit/interop/numpy/test_ufunc_series.py @@ -1,6 +1,7 @@ -from typing import cast +from typing import Callable, cast import numpy as np +import pytest from numpy.testing import assert_array_equal import polars as pl @@ -119,3 +120,41 @@ def test_numpy_string_array() -> None: np.char.capitalize(s_str), np.array(["Aa", "Bb", "Cc", "Dd"], dtype=" Callable[[pl.Series], pl.Series]: + numba = pytest.importorskip("numba") + + @numba.guvectorize([(numba.float64[:], numba.float64[:])], "(n)->(n)") + def add_one(arr, result): # type: ignore[no-untyped-def] + for i in range(len(arr)): + result[i] = arr[i] + 1.0 + + return add_one # type: ignore[no-any-return] + + +def test_generalized_ufunc() -> None: + """A generalized ufunc can be called on a pl.Series.""" + add_one = make_add_one() + s_float = pl.Series("f", [1.0, 2.0, 3.0]) + result = add_one(s_float) + assert_series_equal(result, pl.Series("f", [2.0, 3.0, 4.0])) + + +def test_generalized_ufunc_missing_data() -> None: + """ + If a pl.Series is missing data, using a generalized ufunc is not allowed. + + While this particular example isn't necessarily a semantic issue, consider + a mean() function running on integers: it will give wrong results if the + input is missing data, since NumPy has no way to model missing slots. In + the general case, we can't assume the function will handle missing data + correctly. + """ + add_one = make_add_one() + s_float = pl.Series("f", [1.0, 2.0, 3.0, None], dtype=pl.Float64) + with pytest.raises( + pl.ComputeError, + match="Can't pass a Series with missing data to a generalized ufunc", + ): + add_one(s_float)