feat(python): Add Series/Expr.has_nulls and deprecate Series.has_validity #16488

Merged · 5 commits · May 26, 2024
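For orientation, here is a minimal sketch of the API this PR introduces, assembled from the docstrings and tests in the diff below; the data values are illustrative and not taken from the PR itself:

import polars as pl

s = pl.Series("a", [1, 2, None])
s.has_nulls()        # True: the Series contains at least one null
s[:2].has_nulls()    # False: the slice no longer contains the null

df = pl.DataFrame({"a": [None, 1], "b": [10, 20]})
df.select(pl.all().has_nulls())  # one row of booleans: a=True, b=False

s.has_validity()     # still works, but is now deprecated in favour of has_nulls()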
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/boolean.rst
@@ -8,6 +8,7 @@ Boolean

Expr.all
Expr.any
Expr.has_nulls
Expr.is_between
Expr.is_duplicated
Expr.is_finite
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/descriptive.rst
@@ -9,6 +9,7 @@ Descriptive
Series.chunk_lengths
Series.describe
Series.estimated_size
Series.has_nulls
Series.has_validity
Series.is_boolean
Series.is_duplicated
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
@@ -1657,7 +1657,7 @@ def to_numpy(
else:
arr = s.to_numpy(use_pyarrow=use_pyarrow)

if s.dtype == String and s.null_count() == 0:
if s.dtype == String and not s.has_nulls():
arr = arr.astype(str, copy=False)
arrays.append(arr)
struct_dtype.append((s.name, arr.dtype))
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes/classes.py
@@ -599,7 +599,7 @@ def __init__(self, categories: Series | Iterable[str]):
self.categories = pl.Series(name="category", dtype=String)
return

if categories.null_count() > 0:
if categories.has_nulls():
msg = "Enum categories must not contain null values"
raise TypeError(msg)

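To make the guarded behaviour above concrete, a small hedged example of what constructing an Enum with null categories should now do (the error message is quoted from this hunk; the category values are illustrative):

import polars as pl

pl.Enum(["low", "high"])                 # fine: no null categories
try:
    pl.Enum(pl.Series(["low", None]))    # categories are now checked with has_nulls()
except TypeError as exc:
    print(exc)                           # "Enum categories must not contain null values"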
25 changes: 25 additions & 0 deletions py-polars/polars/expr/expr.py
@@ -3462,6 +3462,31 @@ def null_count(self) -> Self:
"""
return self._from_pyexpr(self._pyexpr.null_count())

def has_nulls(self) -> Expr:
"""
Check whether the expression contains one or more null values.

Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [None, 1, None],
... "b": [10, None, 300],
... "c": [350, 650, 850],
... }
... )
>>> df.select(pl.all().has_nulls())
shape: (1, 3)
┌──────┬──────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ bool ┆ bool ┆ bool │
╞══════╪══════╪═══════╡
│ true ┆ true ┆ false │
└──────┴──────┴───────┘
"""
return self.null_count() > 0

def arg_unique(self) -> Self:
"""
Get index of first unique value.
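Since the new expression is implemented as `null_count() > 0`, the following spellings should be interchangeable; a sketch under that assumption (the parametric tests further down assert the same equivalence):

import polars as pl

lf = pl.LazyFrame({"a": [1, None, 3]})

lf.select(pl.col("a").has_nulls()).collect()        # shape: (1, 1), value True
lf.select(pl.col("a").null_count() > 0).collect()   # same result
lf.select(pl.col("a").is_null().any()).collect()    # same result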
31 changes: 26 additions & 5 deletions py-polars/polars/series/series.py
@@ -1215,7 +1215,7 @@ def __deepcopy__(self, memo: None = None) -> Self:

def __contains__(self, item: Any) -> bool:
if item is None:
return self.null_count() > 0
return self.has_nulls()
return self.implode().list.contains(item).item()

def __iter__(self) -> Generator[Any, None, None]:
@@ -1325,7 +1325,7 @@ def __getitem__(
not item or (isinstance(item[0], int) and not isinstance(item[0], bool)) # type: ignore[redundant-expr]
):
idx_series = Series("", item, dtype=Int64)._pos_idxs(self.len())
if idx_series.has_validity():
if idx_series.has_nulls():
msg = "cannot use `__getitem__` with index values containing nulls"
raise ValueError(msg)
return self._take_with_series(idx_series)
@@ -1398,7 +1398,7 @@ def __array__(
# Cast String types to fixed-length string to support string ufuncs
# TODO: Use variable-length strings instead when NumPy 2.0.0 comes out:
# https://numpy.org/devdocs/reference/routines.dtypes.html#numpy.dtypes.StringDType
if dtype is None and self.null_count() == 0 and self.dtype == String:
if dtype is None and not self.has_nulls() and self.dtype == String:
dtype = np.dtype("U")

if copy is None:
@@ -1479,7 +1479,7 @@ def __array_ufunc__(
if is_generalized_ufunc:
# Generalized ufuncs will operate on the whole array, so
# missing data can corrupt the results.
if self.null_count() > 0:
if self.has_nulls():
msg = "Can't pass a Series with missing data to a generalized ufunc, as it might give unexpected results. See https://docs.pola.rs/user-guide/expressions/missing-data/ for suggestions on how to remove or fill in missing data."
raise ComputeError(msg)
# If the input and output are the same size, e.g. "(n)->(n)" we
@@ -3761,10 +3761,31 @@ def null_count(self) -> int:
"""
return self._s.null_count()

def has_nulls(self) -> bool:
"""
Check whether the Series contains one or more null values.

Examples
--------
>>> s = pl.Series([1, 2, None])
>>> s.has_nulls()
True
>>> s[:2].has_nulls()
False
"""
return self.null_count() > 0

@deprecate_function(
"Use `has_nulls` instead to check for the presence of null values.",
version="0.20.30",
)
def has_validity(self) -> bool:
"""
Return True if the Series has a validity bitmask.

.. deprecated:: 0.20.30
Use :meth:`has_nulls` instead.

If there is no mask, it means that there are no `null` values.

Notes
@@ -3774,7 +3795,7 @@ def has_validity(self) -> bool:
bitmask does not mean that there are null values, as every value of the
bitmask could be `false`.

To confirm that a column has `null` values use :func:`null_count`.
To confirm that a column has `null` values use :meth:`has_nulls`.
"""
return self._s.has_validity()

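Given the `@deprecate_function` decorator above, callers should see a deprecation warning when using `has_validity`; a rough sketch of the expected behaviour (the exact warning category is an assumption, consistent with the `pytest.deprecated_call()` usage in the updated tests):

import warnings

import polars as pl

s = pl.Series([1, None])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    s.has_validity()                 # still callable, but warns
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

s.has_nulls()                        # the replacement, no warning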
9 changes: 6 additions & 3 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
@@ -46,7 +46,10 @@ def test_df_to_numpy_zero_copy(s: pl.Series) -> None:

@pytest.mark.parametrize(
("order", "f_contiguous", "c_contiguous"),
[("fortran", True, False), ("c", False, True)],
[
("fortran", True, False),
("c", False, True),
],
)
def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
@@ -66,12 +69,12 @@ def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) ->

# check string conversion; if no nulls can optimise as a fixed-width dtype
df = pl.DataFrame({"s": ["x", "y", None]})
assert df["s"].has_validity()
assert df["s"].has_nulls()
assert_array_equal(
df.to_numpy(structured=True),
np.array([("x",), ("y",), (None,)], dtype=[("s", "O")]),
)
assert not df["s"][:2].has_validity()
assert not df["s"][:2].has_nulls()
assert_array_equal(
df[:2].to_numpy(structured=True),
np.array([("x",), ("y",)], dtype=[("s", "<U1")]),
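The test above exercises the `has_nulls` fast path in `DataFrame.to_numpy`: string columns with no nulls can be exported as a fixed-width NumPy string field instead of Python objects. A sketch of the observable difference (the dtypes are what the test expects; the rest is illustrative):

import polars as pl

df = pl.DataFrame({"s": ["x", "y", None]})
df.to_numpy(structured=True)["s"].dtype      # object, because the column has a null
df[:2].to_numpy(structured=True)["s"].dtype  # dtype('<U1'), once the null is sliced away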
5 changes: 3 additions & 2 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
@@ -473,7 +473,7 @@ def test_view() -> None:

def test_view_nulls() -> None:
s = pl.Series("b", [1, 2, None])
assert s.has_validity()
assert s.has_nulls()
with pytest.deprecated_call(), pytest.raises(AssertionError):
s.view()

@@ -484,7 +484,8 @@ def test_view_nulls_sliced() -> None:
with pytest.deprecated_call():
view = sliced.view()
assert np.all(view == np.array([1, 2]))
assert not sliced.has_validity()
with pytest.deprecated_call():
assert not sliced.has_validity()


def test_view_ub() -> None:
2 changes: 1 addition & 1 deletion py-polars/tests/unit/operations/map/test_map_elements.py
@@ -151,7 +151,7 @@ def test_map_elements_type_propagation() -> None:
.group_by("a", maintain_order=True)
.agg(
[
pl.when(pl.col("b").null_count() == 0)
pl.when(~pl.col("b").has_nulls())
.then(
pl.col("b").map_elements(
lambda s: s[0]["c"],
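Because `has_nulls` is an ordinary boolean expression, it composes with `~` and `when/then`, which is what the updated test relies on; a small hedged sketch of that pattern with made-up data:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2], "b": [10, None, 30]})
out = df.group_by("a", maintain_order=True).agg(
    pl.when(~pl.col("b").has_nulls())   # per group: only aggregate null-free groups
    .then(pl.col("b").first())
    .otherwise(None)
    .alias("b_first")
)
# group a=1 contains a null in "b", so b_first is null; group a=2 yields 30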
57 changes: 57 additions & 0 deletions py-polars/tests/unit/operations/test_has_nulls.py
@@ -0,0 +1,57 @@
from hypothesis import given

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.parametric import dataframes, series


@given(s=series(allow_null=False))
def test_has_nulls_series_no_nulls(s: pl.Series) -> None:
assert s.has_nulls() is False


@given(df=dataframes(allow_null=False))
def test_has_nulls_expr_no_nulls(df: pl.DataFrame) -> None:
result = df.select(pl.all().has_nulls())
assert result.select(pl.any_horizontal(df.columns)).item() is False


@given(
s=series(
excluded_dtypes=[
pl.Struct, # https://github.com/pola-rs/polars/issues/3462
]
)
)
def test_has_nulls_series_parametric(s: pl.Series) -> None:
result = s.has_nulls()
assert result == (s.null_count() > 0)
assert result == s.is_null().any()


@given(
lf=dataframes(
excluded_dtypes=[
pl.Struct, # https://github.com/pola-rs/polars/issues/3462
],
lazy=True,
)
)
def test_has_nulls_expr_parametric(lf: pl.LazyFrame) -> None:
result = lf.select(pl.all().has_nulls())

assert_frame_equal(result, lf.select(pl.all().null_count() > 0))
assert_frame_equal(result, lf.select(pl.all().is_null().any()))


def test_has_nulls_series() -> None:
s = pl.Series([1, 2, None])
assert s.has_nulls() is True
assert s[:2].has_nulls() is False


def test_has_nulls_expr() -> None:
lf = pl.LazyFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
result = lf.select(pl.all().has_nulls())
expected = pl.LazyFrame({"a": [True], "b": [False]})
assert_frame_equal(result, expected)
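One nuance these tests rely on: `Series.has_nulls` returns a plain Python `bool` (hence the `is True` / `is False` identity checks), while the expression form produces a one-row boolean column; a quick sketch:

import polars as pl

s = pl.Series([1, None])
isinstance(s.has_nulls(), bool)                 # True: the eager API returns a Python bool

df = pl.DataFrame({"a": [1, None]})
df.select(pl.col("a").has_nulls()).item()       # True, extracted from a 1x1 result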
4 changes: 2 additions & 2 deletions py-polars/tests/unit/operations/test_shift.py
@@ -47,11 +47,11 @@ def test_shift_and_fill() -> None:
out = ldf.with_columns(
pl.col("a").shift(n=-2, fill_value=pl.col("b").mean())
).collect()
assert out["a"].null_count() == 0
assert not out["a"].has_nulls()

# use df method
out = ldf.shift(n=2, fill_value=pl.col("b").std()).collect()
assert out["a"].null_count() == 0
assert not out["a"].has_nulls()


def test_shift_expr() -> None:
@@ -59,7 +59,7 @@ def test_series_size_range(s: pl.Series) -> None:

@given(s=series(allow_null=False))
def test_series_allow_null_false(s: pl.Series) -> None:
assert s.null_count() == 0
assert not s.has_nulls()
assert s.dtype != pl.Null


@@ -71,7 +71,8 @@ def test_series_allow_null_allowed_dtypes(s: pl.Series) -> None:
@given(s=series(allowed_dtypes=[pl.List(pl.Int8)], allow_null=False))
def test_series_allow_null_nested(s: pl.Series) -> None:
for v in s:
assert v.null_count() == 0
assert not v.has_nulls()


@given(df=dataframes())
@@ -123,7 +123,7 @@ def test_dataframes_allow_null_column(df: pl.DataFrame) -> None:
)
)
def test_dataframes_allow_null_override(df: pl.DataFrame) -> None:
assert df.get_column("col0").null_count() == 0
assert not df.get_column("col0").has_nulls()
assert 0 <= df.get_column("colx").null_count() <= df.height

