Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(python): Respect dtype and strict in pl.Series's constructor for pyarrow arrays, numpy arrays, and pyarrow-backed pandas #15962

Merged
merged 15 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ def pandas_to_pyseries(
values: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
dtype: PolarsDataType | None = None,
*,
strict: bool = True,
luke396 marked this conversation as resolved.
Show resolved Hide resolved
nan_to_null: bool = True,
) -> PySeries:
"""Construct a PySeries from a pandas Series or DatetimeIndex."""
Expand All @@ -423,11 +424,21 @@ def pandas_to_pyseries(
)
raise ImportError(msg)
return arrow_to_pyseries(
name, plc.pandas_series_to_arrow(values, nan_to_null=nan_to_null)
name,
plc.pandas_series_to_arrow(values, nan_to_null=nan_to_null),
dtype=dtype,
strict=strict,
)


def arrow_to_pyseries(name: str, values: pa.Array, *, rechunk: bool = True) -> PySeries:
def arrow_to_pyseries(
name: str,
values: pa.Array,
dtype: PolarsDataType | None = None,
*,
strict: bool = True,
rechunk: bool = True,
) -> PySeries:
"""Construct a PySeries from an Arrow array."""
array = plc.coerce_arrow(values)

Expand Down Expand Up @@ -464,7 +475,7 @@ def arrow_to_pyseries(name: str, values: pa.Array, *, rechunk: bool = True) -> P
if rechunk:
pys.rechunk(in_place=True)

return pys
return pys.cast(dtype, strict=strict) if dtype is not None else pys


def numpy_to_pyseries(
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,12 +348,12 @@ def __init__(
elif _check_for_pyarrow(values) and isinstance(
values, (pa.Array, pa.ChunkedArray)
):
self._s = arrow_to_pyseries(name, values)
self._s = arrow_to_pyseries(name, values, dtype=dtype, strict=strict)

elif _check_for_pandas(values) and isinstance(
values, (pd.Series, pd.Index, pd.DatetimeIndex)
):
self._s = pandas_to_pyseries(name, values, dtype=dtype)
self._s = pandas_to_pyseries(name, values, dtype=dtype, strict=strict)
luke396 marked this conversation as resolved.
Show resolved Hide resolved

elif _is_generator(values):
self._s = iterable_to_pyseries(name, values, dtype=dtype, strict=strict)
Expand Down
27 changes: 26 additions & 1 deletion py-polars/tests/unit/series/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2340,5 +2340,30 @@ def test_search_sorted(


def test_series_from_pandas_with_dtype() -> None:
    """Check that `dtype` is applied when building a Series from pandas data.

    Covers a default pandas Series and one created with ``dtype="Int64"``,
    each with the default strictness and with ``strict=False``. Every case
    must equal the same Float32 Series.
    """
    expected = pl.Series("foo", [1, 2, 3], dtype=pl.Float32)

    # Default pandas integer Series.
    s = pl.Series("foo", pd.Series([1, 2, 3]), pl.Float32)
    assert_series_equal(s, expected)
    s = pl.Series("foo", pd.Series([1, 2, 3]), pl.Float32, strict=False)
    assert_series_equal(s, expected)

    # pandas Series constructed with dtype="Int64".
    s = pl.Series("foo", pd.Series([1, 2, 3], dtype="Int64"), pl.Float32)
    assert_series_equal(s, expected)
    s = pl.Series(
        "foo", pd.Series([1, 2, 3], dtype="Int64"), pl.Float32, strict=False
    )
    assert_series_equal(s, expected)


def test_series_from_pyarrow_with_dtype() -> None:
    """Check that `dtype` is applied when building a Series from a pyarrow array.

    The same Float32 result is expected with the default strictness and with
    ``strict=False``.
    """
    expected = pl.Series("foo", [1, 2, 3], dtype=pl.Float32)

    default_strictness = pl.Series("foo", pa.array([1, 2, 3]), pl.Float32)
    assert_series_equal(default_strictness, expected)

    non_strict = pl.Series(
        "foo", pa.array([1, 2, 3]), dtype=pl.Float32, strict=False
    )
    assert_series_equal(non_strict, expected)
luke396 marked this conversation as resolved.
Show resolved Hide resolved


def test_series_from_pandas_with_strict() -> None:
    """Build a Series from a pandas Series using ``strict=False`` and no dtype."""
    result = pl.Series(pd.Series([1, 2.5, 3]), strict=False)
    expected = pl.Series([1.0, 2.5, 3.0], dtype=pl.Float64)
    assert_series_equal(result, expected)


def test_series_from_pyarrow_with_strict() -> None:
    """Build a Series from a pyarrow array using ``strict=False`` and no dtype."""
    result = pl.Series(pa.array([1, 2.5, 3]), strict=False)
    expected = pl.Series([1.0, 2.5, 3.0], dtype=pl.Float64)
    assert_series_equal(result, expected)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be missing something, but I don't see what these tests add to what you already have.

Rather than adding these, how about testing converting from a numpy array with [-1, 2, 3] to a Polars Series with dtype pl.UInt8, with strict=False and strict=True? I think this will cover the behaviour change you mentioned in https://github.com/pola-rs/polars/pull/15962/files#r1590912969

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added these two tests because I felt there should be a simple test that uses strict=False without any other parameters. However, I think you are right: these two cases are already covered by other tests.

The behavior change mentioned in #15962 (comment) is already covered by the strict=False test shown below. It was precisely your earlier suggestion to modify the test that led me to discover this potential problem.

s = pl.Series("foo", pd.Series([-1, 2, 3], dtype="Int8"), pl.UInt8, strict=False)
assert s.to_list() == [None, 2, 3]
assert s.dtype == pl.UInt8

Nevertheless, I will add a test with a numpy array and the strict parameter.

Loading