Skip to content

Commit

Permalink
feat(python): don't require pyarrow for converting pandas to Polars if all columns have simple numpy-backed datatypes
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Apr 28, 2024
1 parent f1846a9 commit c52cc41
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 2 deletions.
24 changes: 23 additions & 1 deletion py-polars/polars/_utils/construction/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
contains_nested,
is_namedtuple,
is_pydantic_model,
is_simple_numpy_backed_pandas_series,
nt_unpack,
try_get_type_hints,
)
Expand All @@ -44,6 +45,7 @@
)
from polars.dependencies import (
_NUMPY_AVAILABLE,
_PYARROW_AVAILABLE,
_check_for_numpy,
_check_for_pandas,
dataclasses,
Expand Down Expand Up @@ -1017,10 +1019,30 @@ def pandas_to_pydf(
include_index: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a pandas DataFrame."""
convert_index = include_index and not _pandas_has_default_index(data)
if not convert_index and all(
is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
):
# Convert via NumPy directly, no PyArrow needed.
return pl.DataFrame(
{str(col): data[col].to_numpy() for col in data.columns},
schema=schema,
strict=strict,
schema_overrides=schema_overrides,
nan_to_null=nan_to_null,
)._df

if not _PYARROW_AVAILABLE:
msg = (
"pyarrow is required for converting a pandas dataframe to Polars, "
"unless each of its columns is a simple numpy-backed one "
"(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
)
raise ImportError(msg)
arrow_dict = {}
length = data.shape[0]

if include_index and not _pandas_has_default_index(data):
if convert_index:
for idxcol in data.index.names:
arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
data.index.get_level_values(idxcol),
Expand Down
11 changes: 11 additions & 0 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
get_first_non_none,
is_namedtuple,
is_pydantic_model,
is_simple_numpy_backed_pandas_series,
)
from polars._utils.various import (
find_stacklevel,
Expand Down Expand Up @@ -56,6 +57,7 @@
py_type_to_constructor,
)
from polars.dependencies import (
_PYARROW_AVAILABLE,
_check_for_numpy,
dataclasses,
)
Expand Down Expand Up @@ -408,6 +410,15 @@ def pandas_to_pyseries(
"""Construct a PySeries from a pandas Series or DatetimeIndex."""
if not name and values.name is not None:
name = str(values.name)
if is_simple_numpy_backed_pandas_series(values):
return pl.Series(name, values.to_numpy(), nan_to_null=nan_to_null)._s
if not _PYARROW_AVAILABLE:
msg = (
"pyarrow is required for converting a pandas series to Polars, "
"unless it is a simple numpy-backed one "
"(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
)
raise ImportError(msg)
return arrow_to_pyseries(
name, plc.pandas_series_to_arrow(values, nan_to_null=nan_to_null)
)
Expand Down
43 changes: 42 additions & 1 deletion py-polars/polars/_utils/construction/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@

import sys
from functools import lru_cache
from typing import Any, Callable, Sequence, get_type_hints
from typing import TYPE_CHECKING, Any, Callable, Sequence, get_type_hints

from polars.dependencies import _check_for_pydantic, pydantic

if TYPE_CHECKING:
import pandas as pd


def _get_annotations(obj: type) -> dict[str, Any]:
return getattr(obj, "__annotations__", {})
Expand All @@ -26,6 +29,26 @@ def try_get_type_hints(obj: type) -> dict[str, Any]:
else:
try_get_type_hints = _get_annotations

# Pandas dtype strings that are plain numpy-backed, meaning the column can be
# converted to Polars directly through NumPy without requiring pyarrow.
PANDAS_SIMPLE_NUMPY_DTYPES = {
    # signed and unsigned integers of every standard width
    *(f"int{bits}" for bits in (8, 16, 32, 64)),
    *(f"uint{bits}" for bits in (8, 16, 32, 64)),
    "float64",
    "float32",
    # temporal dtypes at millisecond / microsecond / nanosecond resolution
    *(f"datetime64[{unit}]" for unit in ("ms", "us", "ns")),
    *(f"timedelta64[{unit}]" for unit in ("ms", "us", "ns")),
    "bool",
}


@lru_cache(64)
def is_namedtuple(cls: Any, *, annotated: bool = False) -> bool:
Expand Down Expand Up @@ -75,3 +98,21 @@ def contains_nested(value: Any, is_nested: Callable[[Any], bool]) -> bool:
elif isinstance(value, (list, tuple)):
return any(contains_nested(v, is_nested) for v in value)
return False


def is_simple_numpy_backed_pandas_series(
    series: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
) -> bool:
    """Check whether a pandas Series/Index can be converted via NumPy alone.

    Returns True when the dtype is a plain numpy-backed one (see
    `PANDAS_SIMPLE_NUMPY_DTYPES`), or when it is a non-empty object column
    whose first element is a string — in both cases the conversion to Polars
    does not require pyarrow.  Note that for object columns only the first
    element is inspected; mixed object columns may slip through.

    Raises
    ------
    ValueError
        If `series` is in fact a DataFrame — this happens when the original
        DataFrame contains duplicated column names and a duplicated column is
        requested with ``df["a"]``.
    """
    if len(series.shape) > 1:
        # Build the message as a single string: passing two args to ValueError
        # would make the exception render as a tuple instead of a clean message.
        msg = f"duplicate column names found: {series.columns.tolist()!s}"  # type: ignore[union-attr]
        raise ValueError(msg)
    return (str(series.dtype) in PANDAS_SIMPLE_NUMPY_DTYPES) or (
        series.dtype == "object"
        and not series.empty
        and isinstance(next(iter(series)), str)
    )
38 changes: 38 additions & 0 deletions py-polars/tests/unit/interop/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -998,3 +998,41 @@ def test_from_avro_valid_time_zone_13032() -> None:
result = cast(pl.Series, pl.from_arrow(arr))
expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
assert_series_equal(result, expected)


def test_from_pandas_pyarrow_not_available(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """`from_pandas` works without pyarrow when every column is numpy-backed."""
    # Simulate pyarrow being unavailable in both construction modules.
    for target in (
        "polars._utils.construction.dataframe._PYARROW_AVAILABLE",
        "polars._utils.construction.series._PYARROW_AVAILABLE",
    ):
        monkeypatch.setattr(target, False)

    data: dict[str, Any] = {
        "a": [1, 2],
        "b": ["one", "two"],
        "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"),
        "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"),
        "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"),
        "f": np.array([1, 2], dtype="timedelta64[ns]"),
        "g": np.array([1, 2], dtype="timedelta64[us]"),
        "h": np.array([1, 2], dtype="timedelta64[ms]"),
        "i": [True, False],
    }

    # Whole-frame round trip succeeds with no pyarrow.
    assert_frame_equal(pl.from_pandas(pd.DataFrame(data)), pl.DataFrame(data))

    # Per-column round trips via pandas Series also succeed.
    for values in data.values():
        assert_series_equal(pl.from_pandas(pd.Series(values)), pl.Series(values))

    # Inputs that still require pyarrow raise a helpful ImportError instead.
    pyarrow_needed = (
        pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"),
        pd.Series([1, 2, 3], dtype="Int64"),
        pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()}),
        pd.DataFrame({"a": [None, "foo"]}),
    )
    for pandas_obj in pyarrow_needed:
        with pytest.raises(ImportError, match="pyarrow is required"):
            pl.from_pandas(pandas_obj)

0 comments on commit c52cc41

Please sign in to comment.