diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py
index 4e231e7a50284..c20a60c3162ef 100644
--- a/py-polars/polars/_utils/construction/dataframe.py
+++ b/py-polars/polars/_utils/construction/dataframe.py
@@ -23,6 +23,7 @@
     contains_nested,
     is_namedtuple,
     is_pydantic_model,
+    is_simple_numpy_backed_pandas_series,
     nt_unpack,
     try_get_type_hints,
 )
@@ -44,6 +45,7 @@
 )
 from polars.dependencies import (
     _NUMPY_AVAILABLE,
+    _PYARROW_AVAILABLE,
     _check_for_numpy,
     _check_for_pandas,
     dataclasses,
 )
@@ -1017,10 +1019,30 @@ def pandas_to_pydf(
     include_index: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a pandas DataFrame."""
+    convert_index = include_index and not _pandas_has_default_index(data)
+    if not convert_index and all(
+        is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
+    ):
+        # Convert via NumPy directly, no PyArrow needed.
+        return pl.DataFrame(
+            {str(col): data[col].to_numpy() for col in data.columns},
+            schema=schema,
+            strict=strict,
+            schema_overrides=schema_overrides,
+            nan_to_null=nan_to_null,
+        )._df
+
+    if not _PYARROW_AVAILABLE:
+        msg = (
+            "pyarrow is required for converting a pandas dataframe to Polars, "
+            "unless each of its columns is a simple numpy-backed one "
+            "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
+        )
+        raise ImportError(msg)
     arrow_dict = {}
     length = data.shape[0]
 
-    if include_index and not _pandas_has_default_index(data):
+    if convert_index:
         for idxcol in data.index.names:
             arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
                 data.index.get_level_values(idxcol),
diff --git a/py-polars/polars/_utils/construction/series.py b/py-polars/polars/_utils/construction/series.py
index e112a169e7286..4a4d1d2b41c09 100644
--- a/py-polars/polars/_utils/construction/series.py
+++ b/py-polars/polars/_utils/construction/series.py
@@ -22,6 +22,7 @@
     get_first_non_none,
     is_namedtuple,
     is_pydantic_model,
+    is_simple_numpy_backed_pandas_series,
 )
 from polars._utils.various import (
     find_stacklevel,
@@ -56,6 +57,7 @@
     py_type_to_constructor,
 )
 from polars.dependencies import (
+    _PYARROW_AVAILABLE,
     _check_for_numpy,
     dataclasses,
 )
@@ -408,6 +410,15 @@
     """Construct a PySeries from a pandas Series or DatetimeIndex."""
     if not name and values.name is not None:
         name = str(values.name)
+    if is_simple_numpy_backed_pandas_series(values):
+        return pl.Series(name, values.to_numpy(), nan_to_null=nan_to_null)._s
+    if not _PYARROW_AVAILABLE:
+        msg = (
+            "pyarrow is required for converting a pandas series to Polars, "
+            "unless it is a simple numpy-backed one "
+            "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
+        )
+        raise ImportError(msg)
     return arrow_to_pyseries(
         name, plc.pandas_series_to_arrow(values, nan_to_null=nan_to_null)
     )
diff --git a/py-polars/polars/_utils/construction/utils.py b/py-polars/polars/_utils/construction/utils.py
index dbfc67933273d..dc475908d135b 100644
--- a/py-polars/polars/_utils/construction/utils.py
+++ b/py-polars/polars/_utils/construction/utils.py
@@ -2,10 +2,13 @@
 
 import sys
 from functools import lru_cache
-from typing import Any, Callable, Sequence, get_type_hints
+from typing import TYPE_CHECKING, Any, Callable, Sequence, get_type_hints
 
 from polars.dependencies import _check_for_pydantic, pydantic
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 
 def _get_annotations(obj: type) -> dict[str, Any]:
     return getattr(obj, "__annotations__", {})
@@ -26,6 +29,26 @@
 else:
     try_get_type_hints = _get_annotations
 
+PANDAS_SIMPLE_NUMPY_DTYPES = {
+    "int64",
+    "int32",
+    "int16",
+    "int8",
+    "uint64",
+    "uint32",
+    "uint16",
+    "uint8",
+    "float64",
+    "float32",
+    "datetime64[ms]",
+    "datetime64[us]",
+    "datetime64[ns]",
+    "timedelta64[ms]",
+    "timedelta64[us]",
+    "timedelta64[ns]",
+    "bool",
+}
+
 
 @lru_cache(64)
 def is_namedtuple(cls: Any, *, annotated: bool = False) -> bool:
@@ -75,3 +98,21 @@ def contains_nested(value: Any, is_nested: Callable[[Any], bool]) -> bool:
     elif isinstance(value, (list, tuple)):
         return any(contains_nested(v, is_nested) for v in value)
     return False
+
+
+def is_simple_numpy_backed_pandas_series(
+    series: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
+) -> bool:
+    if len(series.shape) > 1:
+        # Pandas Series is actually a Pandas DataFrame when the original DataFrame
+        # contains duplicated columns and a duplicated column is requested with df["a"].
+        msg = "duplicate column names found: "
+        raise ValueError(
+            msg,
+            f"{series.columns.tolist()!s}",  # type: ignore[union-attr]
+        )
+    return (str(series.dtype) in PANDAS_SIMPLE_NUMPY_DTYPES) or (
+        series.dtype == "object"
+        and not series.empty
+        and isinstance(next(iter(series)), str)
+    )
diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py
index 630530f457be8..5683771d06e70 100644
--- a/py-polars/tests/unit/interop/test_interop.py
+++ b/py-polars/tests/unit/interop/test_interop.py
@@ -998,3 +998,41 @@ def test_from_avro_valid_time_zone_13032() -> None:
     result = cast(pl.Series, pl.from_arrow(arr))
     expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
     assert_series_equal(result, expected)
+
+
+def test_from_pandas_pyarrow_not_available(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setattr(
+        "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False
+    )
+    monkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False)
+    data: dict[str, Any] = {
+        "a": [1, 2],
+        "b": ["one", "two"],
+        "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"),
+        "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"),
+        "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"),
+        "f": np.array([1, 2], dtype="timedelta64[ns]"),
+        "g": np.array([1, 2], dtype="timedelta64[us]"),
+        "h": np.array([1, 2], dtype="timedelta64[ms]"),
+        "i": [True, False],
+    }
+    result = pl.from_pandas(pd.DataFrame(data))
+    expected = pl.DataFrame(data)
+    assert_frame_equal(result, expected)
+    for col in data:
+        s_pd = pd.Series(data[col])
+        result_s = pl.from_pandas(s_pd)
+        expected_s = pl.Series(data[col])
+        assert_series_equal(result_s, expected_s)
+    with pytest.raises(ImportError, match="pyarrow is required"):
+        pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"))
+    with pytest.raises(ImportError, match="pyarrow is required"):
+        pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64"))
+    with pytest.raises(ImportError, match="pyarrow is required"):
+        pl.from_pandas(
+            pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()})
+        )
+    with pytest.raises(ImportError, match="pyarrow is required"):
+        pl.from_pandas(pd.DataFrame({"a": [None, "foo"]}))