feat(python): Add Series/Expr.has_nulls and deprecate Series.has_validity #16488

Merged · 5 commits · May 26, 2024
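For orientation, here is a minimal sketch of the API this PR introduces, assembled from the docstrings and tests in the diff below; the data values are illustrative and not taken from the PR itself:

import polars as pl

s = pl.Series("a", [1, 2, None])
s.has_nulls()        # True: the Series contains at least one null
s[:2].has_nulls()    # False: the slice no longer contains the null

df = pl.DataFrame({"a": [None, 1], "b": [10, 20]})
df.select(pl.all().has_nulls())  # one row of booleans: a=True, b=False

s.has_validity()     # still works, but is now deprecated in favour of has_nulls()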
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/boolean.rst
@@ -8,6 +8,7 @@ Boolean

Expr.all
Expr.any
Expr.has_nulls
Expr.is_between
Expr.is_duplicated
Expr.is_finite
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/descriptive.rst
@@ -9,6 +9,7 @@ Descriptive
Series.chunk_lengths
Series.describe
Series.estimated_size
Series.has_nulls
Series.has_validity
Series.is_boolean
Series.is_duplicated
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
@@ -1657,7 +1657,7 @@ def to_numpy(
else:
arr = s.to_numpy(use_pyarrow=use_pyarrow)

if s.dtype == String and s.null_count() == 0:
if s.dtype == String and not s.has_nulls():
arr = arr.astype(str, copy=False)
arrays.append(arr)
struct_dtype.append((s.name, arr.dtype))
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes/classes.py
@@ -599,7 +599,7 @@ def __init__(self, categories: Series | Iterable[str]):
self.categories = pl.Series(name="category", dtype=String)
return

if categories.null_count() > 0:
if categories.has_nulls():
msg = "Enum categories must not contain null values"
raise TypeError(msg)

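To make the guarded behaviour above concrete, a small hedged example of what constructing an Enum with null categories should now do (the error message is quoted from this hunk; the category values are illustrative):

import polars as pl

pl.Enum(["low", "high"])                 # fine: no null categories
try:
    pl.Enum(pl.Series(["low", None]))    # categories are now checked with has_nulls()
except TypeError as exc:
    print(exc)                           # "Enum categories must not contain null values"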
25 changes: 25 additions & 0 deletions py-polars/polars/expr/expr.py
@@ -3462,6 +3462,31 @@ def null_count(self) -> Self:
"""
return self._from_pyexpr(self._pyexpr.null_count())

def has_nulls(self) -> Expr:
"""
Check whether the expression contains one or more null values.

Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [None, 1, None],
... "b": [10, None, 300],
... "c": [350, 650, 850],
... }
... )
>>> df.select(pl.all().has_nulls())
shape: (1, 3)
┌──────┬──────┬───────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ bool ┆ bool ┆ bool │
╞══════╪══════╪═══════╡
│ true ┆ true ┆ false │
└──────┴──────┴───────┘
"""
return self.null_count() > 0

def arg_unique(self) -> Self:
"""
Get index of first unique value.
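Since the new expression is implemented as `null_count() > 0`, the following spellings should be interchangeable; a sketch under that assumption (the parametric tests further down assert the same equivalence):

import polars as pl

lf = pl.LazyFrame({"a": [1, None, 3]})

lf.select(pl.col("a").has_nulls()).collect()        # shape: (1, 1), value True
lf.select(pl.col("a").null_count() > 0).collect()   # same result
lf.select(pl.col("a").is_null().any()).collect()    # same result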
31 changes: 26 additions & 5 deletions py-polars/polars/series/series.py
@@ -1215,7 +1215,7 @@ def __deepcopy__(self, memo: None = None) -> Self:

def __contains__(self, item: Any) -> bool:
if item is None:
return self.null_count() > 0
return self.has_nulls()
return self.implode().list.contains(item).item()

def __iter__(self) -> Generator[Any, None, None]:
@@ -1325,7 +1325,7 @@ def __getitem__(
not item or (isinstance(item[0], int) and not isinstance(item[0], bool)) # type: ignore[redundant-expr]
):
idx_series = Series("", item, dtype=Int64)._pos_idxs(self.len())
if idx_series.has_validity():
if idx_series.has_nulls():
msg = "cannot use `__getitem__` with index values containing nulls"
raise ValueError(msg)
return self._take_with_series(idx_series)
@@ -1398,7 +1398,7 @@ def __array__(
# Cast String types to fixed-length string to support string ufuncs
# TODO: Use variable-length strings instead when NumPy 2.0.0 comes out:
# https://numpy.org/devdocs/reference/routines.dtypes.html#numpy.dtypes.StringDType
if dtype is None and self.null_count() == 0 and self.dtype == String:
if dtype is None and not self.has_nulls() and self.dtype == String:
dtype = np.dtype("U")

if copy is None:
@@ -1479,7 +1479,7 @@ def __array_ufunc__(
if is_generalized_ufunc:
# Generalized ufuncs will operate on the whole array, so
# missing data can corrupt the results.
if self.null_count() > 0:
if self.has_nulls():
msg = "Can't pass a Series with missing data to a generalized ufunc, as it might give unexpected results. See https://docs.pola.rs/user-guide/expressions/missing-data/ for suggestions on how to remove or fill in missing data."
raise ComputeError(msg)
# If the input and output are the same size, e.g. "(n)->(n)" we
@@ -3761,10 +3761,31 @@ def null_count(self) -> int:
"""
return self._s.null_count()

def has_nulls(self) -> bool:
"""
Check whether the Series contains one or more null values.

Examples
--------
>>> s = pl.Series([1, 2, None])
>>> s.has_nulls()
True
>>> s[:2].has_nulls()
False
"""
return self.null_count() > 0

@deprecate_function(
"Use `has_nulls` instead to check for the presence of null values.",
version="0.20.30",
)
def has_validity(self) -> bool:
"""
Return True if the Series has a validity bitmask.

.. deprecated:: 0.20.30
Use :meth:`has_nulls` instead.

If there is no mask, it means that there are no `null` values.

Notes
@@ -3774,7 +3795,7 @@ def has_validity(self) -> bool:
bitmask does not mean that there are null values, as every value of the
bitmask could be `false`.

To confirm that a column has `null` values use :func:`null_count`.
To confirm that a column has `null` values use :meth:`has_nulls`.
"""
return self._s.has_validity()

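Given the `@deprecate_function` decorator above, callers should see a deprecation warning when using `has_validity`; a rough sketch of the expected behaviour (the exact warning category is an assumption, consistent with the `pytest.deprecated_call()` usage in the updated tests):

import warnings

import polars as pl

s = pl.Series([1, None])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    s.has_validity()                 # still callable, but warns
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

s.has_nulls()                        # the replacement, no warning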
9 changes: 6 additions & 3 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
@@ -46,7 +46,10 @@ def test_df_to_numpy_zero_copy(s: pl.Series) -> None:

@pytest.mark.parametrize(
("order", "f_contiguous", "c_contiguous"),
[("fortran", True, False), ("c", False, True)],
[
("fortran", True, False),
("c", False, True),
],
)
def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) -> None:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
@@ -66,12 +69,12 @@ def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) ->

# check string conversion; if no nulls can optimise as a fixed-width dtype
df = pl.DataFrame({"s": ["x", "y", None]})
assert df["s"].has_validity()
assert df["s"].has_nulls()
assert_array_equal(
df.to_numpy(structured=True),
np.array([("x",), ("y",), (None,)], dtype=[("s", "O")]),
)
assert not df["s"][:2].has_validity()
assert not df["s"][:2].has_nulls()
assert_array_equal(
df[:2].to_numpy(structured=True),
np.array([("x",), ("y",)], dtype=[("s", "<U1")]),
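The test above exercises the `has_nulls` fast path in `DataFrame.to_numpy`: string columns with no nulls can be exported as a fixed-width NumPy string field instead of Python objects. A sketch of the observable difference (the dtypes are what the test expects; the rest is illustrative):

import polars as pl

df = pl.DataFrame({"s": ["x", "y", None]})
df.to_numpy(structured=True)["s"].dtype      # object, because the column has a null
df[:2].to_numpy(structured=True)["s"].dtype  # dtype('<U1'), once the null is sliced away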
5 changes: 3 additions & 2 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
@@ -473,7 +473,7 @@ def test_view() -> None:

def test_view_nulls() -> None:
s = pl.Series("b", [1, 2, None])
assert s.has_validity()
assert s.has_nulls()
with pytest.deprecated_call(), pytest.raises(AssertionError):
s.view()

@@ -484,7 +484,8 @@ def test_view_nulls_sliced() -> None:
with pytest.deprecated_call():
view = sliced.view()
assert np.all(view == np.array([1, 2]))
assert not sliced.has_validity()
with pytest.deprecated_call():
assert not sliced.has_validity()


def test_view_ub() -> None:
2 changes: 1 addition & 1 deletion py-polars/tests/unit/operations/map/test_map_elements.py
@@ -151,7 +151,7 @@ def test_map_elements_type_propagation() -> None:
.group_by("a", maintain_order=True)
.agg(
[
pl.when(pl.col("b").null_count() == 0)
pl.when(~pl.col("b").has_nulls())
.then(
pl.col("b").map_elements(
lambda s: s[0]["c"],
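Because `has_nulls` is an ordinary boolean expression, it composes with `~` and `when/then`, which is what the updated test relies on; a small hedged sketch of that pattern with made-up data:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2], "b": [10, None, 30]})
out = df.group_by("a", maintain_order=True).agg(
    pl.when(~pl.col("b").has_nulls())   # per group: only aggregate null-free groups
    .then(pl.col("b").first())
    .otherwise(None)
    .alias("b_first")
)
# group a=1 contains a null in "b", so b_first is null; group a=2 yields 30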
57 changes: 57 additions & 0 deletions py-polars/tests/unit/operations/test_has_nulls.py
@@ -0,0 +1,57 @@
from hypothesis import given

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.parametric import dataframes, series


@given(s=series(allow_null=False))
def test_has_nulls_series_no_nulls(s: pl.Series) -> None:
assert s.has_nulls() is False


@given(df=dataframes(allow_null=False))
def test_has_nulls_expr_no_nulls(df: pl.DataFrame) -> None:
result = df.select(pl.all().has_nulls())
assert result.select(pl.any_horizontal(df.columns)).item() is False


@given(
s=series(
excluded_dtypes=[
pl.Struct, # https://github.com/pola-rs/polars/issues/3462
]
)
)
def test_has_nulls_series_parametric(s: pl.Series) -> None:
result = s.has_nulls()
assert result == (s.null_count() > 0)
assert result == s.is_null().any()


@given(
lf=dataframes(
excluded_dtypes=[
pl.Struct, # https://github.com/pola-rs/polars/issues/3462
],
lazy=True,
)
)
def test_has_nulls_expr_parametric(lf: pl.LazyFrame) -> None:
result = lf.select(pl.all().has_nulls())

assert_frame_equal(result, lf.select(pl.all().null_count() > 0))
assert_frame_equal(result, lf.select(pl.all().is_null().any()))


def test_has_nulls_series() -> None:
s = pl.Series([1, 2, None])
assert s.has_nulls() is True
assert s[:2].has_nulls() is False


def test_has_nulls_expr() -> None:
lf = pl.LazyFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
result = lf.select(pl.all().has_nulls())
expected = pl.LazyFrame({"a": [True], "b": [False]})
assert_frame_equal(result, expected)
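One nuance these tests rely on: `Series.has_nulls` returns a plain Python `bool` (hence the `is True` / `is False` identity checks), while the expression form produces a one-row boolean column; a quick sketch:

import polars as pl

s = pl.Series([1, None])
isinstance(s.has_nulls(), bool)                 # True: the eager API returns a Python bool

df = pl.DataFrame({"a": [1, None]})
df.select(pl.col("a").has_nulls()).item()       # True, extracted from a 1x1 result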
4 changes: 2 additions & 2 deletions py-polars/tests/unit/operations/test_shift.py
@@ -47,11 +47,11 @@ def test_shift_and_fill() -> None:
out = ldf.with_columns(
pl.col("a").shift(n=-2, fill_value=pl.col("b").mean())
).collect()
assert out["a"].null_count() == 0
assert not out["a"].has_nulls()

# use df method
out = ldf.shift(n=2, fill_value=pl.col("b").std()).collect()
assert out["a"].null_count() == 0
assert not out["a"].has_nulls()


def test_shift_expr() -> None:
@@ -59,7 +59,7 @@ def test_series_size_range(s: pl.Series) -> None:

@given(s=series(allow_null=False))
def test_series_allow_null_false(s: pl.Series) -> None:
assert s.null_count() == 0
assert not s.has_nulls()
assert s.dtype != pl.Null


@@ -71,7 +71,8 @@ def test_series_allow_null_allowed_dtypes(s: pl.Series) -> None:
@given(s=series(allowed_dtypes=[pl.List(pl.Int8)], allow_null=False))
def test_series_allow_null_nested(s: pl.Series) -> None:
for v in s:
assert v.null_count() == 0
assert not v.has_nulls()


@given(df=dataframes())
@@ -123,7 +123,7 @@ def test_dataframes_allow_null_column(df: pl.DataFrame) -> None:
)
)
def test_dataframes_allow_null_override(df: pl.DataFrame) -> None:
assert df.get_column("col0").null_count() == 0
assert not df.get_column("col0").has_nulls()
assert 0 <= df.get_column("colx").null_count() <= df.height

