Skip to content

Commit

Permalink
fix(python): Fix Series constructor failure for Array types for large integers (#16050)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 5, 2024
1 parent e17ef46 commit b57f8ea
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 141 deletions.
278 changes: 137 additions & 141 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from polars.datatypes import (
INTEGER_DTYPES,
TEMPORAL_DTYPES,
Array,
Boolean,
Categorical,
Date,
Expand Down Expand Up @@ -96,7 +97,7 @@ def sequence_to_pyseries(
dtype = Null

# lists defer to subsequent handling; identify nested type
elif dtype == List:
elif dtype in (List, Array):
python_dtype = list

# infer temporal type handling
Expand Down Expand Up @@ -130,8 +131,9 @@ def sequence_to_pyseries(
# flat data
if (
dtype is not None
and dtype not in (List, Struct, Unknown)
and is_polars_dtype(dtype)
and not dtype.is_nested()
and dtype != Unknown
and (python_dtype is None)
):
constructor = polars_type_to_constructor(dtype)
Expand Down Expand Up @@ -160,159 +162,153 @@ def sequence_to_pyseries(
schema=struct_schema,
orient="row",
).to_struct(name)
else:
if python_dtype is None:
if value is None:
constructor = polars_type_to_constructor(Null)
return constructor(name, values, strict)

# generic default dtype
python_dtype = type(value)

# temporal branch
if python_dtype in py_temporal_types:
if dtype is None:
dtype = py_type_to_dtype(python_dtype) # construct from integer
elif dtype in py_temporal_types:
dtype = py_type_to_dtype(dtype)

values_dtype = (
None
if value is None
else py_type_to_dtype(type(value), raise_unmatched=False)

if python_dtype is None:
if value is None:
constructor = polars_type_to_constructor(Null)
return constructor(name, values, strict)

# generic default dtype
python_dtype = type(value)

# temporal branch
if python_dtype in py_temporal_types:
if dtype is None:
dtype = py_type_to_dtype(python_dtype) # construct from integer
elif dtype in py_temporal_types:
dtype = py_type_to_dtype(dtype)

values_dtype = (
None
if value is None
else py_type_to_dtype(type(value), raise_unmatched=False)
)
if values_dtype is not None and values_dtype.is_float():
msg = f"'float' object cannot be interpreted as a {python_dtype.__name__!r}"
raise TypeError(
# we do not accept float values as temporal; if this is
# required, the caller should explicitly cast to int first.
msg
)
if values_dtype is not None and values_dtype.is_float():
msg = f"'float' object cannot be interpreted as a {python_dtype.__name__!r}"
raise TypeError(
# we do not accept float values as temporal; if this is
# required, the caller should explicitly cast to int first.
msg
)

# We use the AnyValue builder to create the datetime array
# We store the values internally as UTC and set the timezone
py_series = PySeries.new_from_any_values(name, values, strict)
# We use the AnyValue builder to create the datetime array
# We store the values internally as UTC and set the timezone
py_series = PySeries.new_from_any_values(name, values, strict)

time_unit = getattr(dtype, "time_unit", None)
time_zone = getattr(dtype, "time_zone", None)
time_unit = getattr(dtype, "time_unit", None)
time_zone = getattr(dtype, "time_zone", None)

if time_unit is None or values_dtype == Date:
s = wrap_s(py_series)
else:
s = wrap_s(py_series).dt.cast_time_unit(time_unit)
if time_unit is None or values_dtype == Date:
s = wrap_s(py_series)
else:
s = wrap_s(py_series).dt.cast_time_unit(time_unit)

if (values_dtype == Date) & (dtype == Datetime):
return (
s.cast(Datetime(time_unit or "us"))
.dt.replace_time_zone(time_zone)
._s
if (values_dtype == Date) & (dtype == Datetime):
return (
s.cast(Datetime(time_unit or "us")).dt.replace_time_zone(time_zone)._s
)

if (dtype == Datetime) and (value.tzinfo is not None or time_zone is not None):
values_tz = str(value.tzinfo) if value.tzinfo is not None else None
dtype_tz = dtype.time_zone # type: ignore[union-attr]
if values_tz is not None and (dtype_tz is not None and dtype_tz != "UTC"):
msg = (
"time-zone-aware datetimes are converted to UTC"
"\n\nPlease either drop the time zone from the dtype, or set it to 'UTC'."
" To convert to a different time zone, please use `.dt.convert_time_zone`."
)
raise ValueError(msg)
if values_tz != "UTC" and dtype_tz is None:
warnings.warn(
"Constructing a Series with time-zone-aware "
"datetimes results in a Series with UTC time zone. "
"To silence this warning, you can filter "
"warnings of class TimeZoneAwareConstructorWarning, or "
"set 'UTC' as the time zone of your datatype.",
TimeZoneAwareConstructorWarning,
stacklevel=find_stacklevel(),
)
return s.dt.replace_time_zone(dtype_tz or "UTC")._s
return s._s

if (dtype == Datetime) and (
value.tzinfo is not None or time_zone is not None
):
values_tz = str(value.tzinfo) if value.tzinfo is not None else None
dtype_tz = dtype.time_zone # type: ignore[union-attr]
if values_tz is not None and (
dtype_tz is not None and dtype_tz != "UTC"
):
msg = (
"time-zone-aware datetimes are converted to UTC"
"\n\nPlease either drop the time zone from the dtype, or set it to 'UTC'."
" To convert to a different time zone, please use `.dt.convert_time_zone`."
)
raise ValueError(msg)
if values_tz != "UTC" and dtype_tz is None:
warnings.warn(
"Constructing a Series with time-zone-aware "
"datetimes results in a Series with UTC time zone. "
"To silence this warning, you can filter "
"warnings of class TimeZoneAwareConstructorWarning, or "
"set 'UTC' as the time zone of your datatype.",
TimeZoneAwareConstructorWarning,
stacklevel=find_stacklevel(),
)
return s.dt.replace_time_zone(dtype_tz or "UTC")._s
return s._s
elif (
_check_for_numpy(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
n_elems = len(value)
if all(len(v) == n_elems for v in values):
# can take (much) faster path if all lists are the same length
return numpy_to_pyseries(
name,
np.vstack(values),
strict=strict,
nan_to_null=nan_to_null,
)
else:
return PySeries.new_series_list(
name,
[
numpy_to_pyseries("", v, strict=strict, nan_to_null=nan_to_null)
for v in values
],
strict,
)

elif (
_check_for_numpy(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
n_elems = len(value)
if all(len(v) == n_elems for v in values):
# can take (much) faster path if all lists are the same length
return numpy_to_pyseries(
name,
np.vstack(values),
strict=strict,
nan_to_null=nan_to_null,
)
elif python_dtype in (list, tuple):
if dtype is None:
return PySeries.new_from_any_values(name, values, strict=strict)
elif dtype == Object:
return PySeries.new_object(name, values, strict)
else:
if (inner_dtype := getattr(dtype, "inner", None)) is not None:
pyseries_list = [
None
if value is None
else sequence_to_pyseries(
"",
value,
inner_dtype,
strict=strict,
nan_to_null=nan_to_null,
)
for value in values
]
pyseries = PySeries.new_series_list(name, pyseries_list, strict)
else:
return PySeries.new_series_list(
name,
[
numpy_to_pyseries("", v, strict=strict, nan_to_null=nan_to_null)
for v in values
],
strict,
pyseries = PySeries.new_from_any_values_and_dtype(
name, values, dtype, strict=strict
)
if dtype != pyseries.dtype():
pyseries = pyseries.cast(dtype, strict=False)
return pyseries

elif python_dtype in (list, tuple):
if dtype is None:
return PySeries.new_from_any_values(name, values, strict=strict)
elif dtype == Object:
return PySeries.new_object(name, values, strict)
else:
if (inner_dtype := getattr(dtype, "inner", None)) is not None:
pyseries_list = [
None
if value is None
else sequence_to_pyseries(
"",
value,
inner_dtype,
strict=strict,
nan_to_null=nan_to_null,
)
for value in values
]
pyseries = PySeries.new_series_list(name, pyseries_list, strict)
elif python_dtype == pl.Series:
return PySeries.new_series_list(
name, [v._s if v is not None else None for v in values], strict
)

elif python_dtype == PySeries:
return PySeries.new_series_list(name, values, strict)
else:
constructor = py_type_to_constructor(python_dtype)
if constructor == PySeries.new_object:
try:
srs = PySeries.new_from_any_values(name, values, strict)
if _check_for_numpy(python_dtype, check_type=False) and isinstance(
np.bool_(True), np.generic
):
dtype = numpy_char_code_to_dtype(np.dtype(python_dtype).char)
return srs.cast(dtype, strict=strict)
else:
pyseries = PySeries.new_from_any_values_and_dtype(
name, values, dtype, strict=strict
)
if dtype != pyseries.dtype():
pyseries = pyseries.cast(dtype, strict=False)
return pyseries
return srs

elif python_dtype == pl.Series:
return PySeries.new_series_list(
name, [v._s if v is not None else None for v in values], strict
)
except RuntimeError:
return PySeries.new_from_any_values(name, values, strict=strict)

elif python_dtype == PySeries:
return PySeries.new_series_list(name, values, strict)
else:
constructor = py_type_to_constructor(python_dtype)
if constructor == PySeries.new_object:
try:
srs = PySeries.new_from_any_values(name, values, strict)
if _check_for_numpy(python_dtype, check_type=False) and isinstance(
np.bool_(True), np.generic
):
dtype = numpy_char_code_to_dtype(np.dtype(python_dtype).char)
return srs.cast(dtype, strict=strict)
else:
return srs

except RuntimeError:
return PySeries.new_from_any_values(name, values, strict=strict)

return _construct_series_with_fallbacks(
constructor, name, values, dtype, strict=strict
)
return _construct_series_with_fallbacks(
constructor, name, values, dtype, strict=strict
)


def _construct_series_with_fallbacks(
Expand Down
9 changes: 9 additions & 0 deletions py-polars/tests/unit/constructors/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@ def test_large_timedelta(dtype: pl.DataType | None) -> None:
# Microsecond precision is lost
expected = [timedelta.min, timedelta.max - timedelta(microseconds=999)]
assert s.to_list() == expected


def test_array_large_u64() -> None:
    """Build an Array(UInt64, 1) Series holding the maximum unsigned 64-bit value.

    The Series must keep the requested dtype and return the input unchanged
    from `to_list()`.
    """
    # 2**64 - 1 is the largest value a UInt64 can hold; it does not fit in a
    # signed 64-bit integer.
    largest_u64 = 2**64 - 1
    rows = [[largest_u64]]
    array_dtype = pl.Array(pl.UInt64, 1)

    result = pl.Series(rows, dtype=array_dtype)

    assert result.dtype == array_dtype
    assert result.to_list() == rows

0 comments on commit b57f8ea

Please sign in to comment.