Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for polars dataframes and series #7463

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pymc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from typing import cast

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
import pytensor
import pytensor.tensor as pt
import xarray as xr
Expand Down Expand Up @@ -204,7 +204,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:

def determine_coords(
model,
value: pd.DataFrame | pd.Series | xr.DataArray,
value: ArrayLike,
dims: Sequence[str | None] | None = None,
coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None]]:
Expand Down Expand Up @@ -348,7 +348,7 @@ def Data(
----------
name : str
The name for this variable.
value : array_like or pandas.Series, pandas.Dataframe
value : array_like or pandas.Series, pandas.DataFrame, polars.DataFrame, polars.Series, xarray.DataArray
A value to associate with this variable.
dims : str or tuple of str, optional
Dimension names of the random variables (as opposed to the shapes of these
Expand Down
33 changes: 28 additions & 5 deletions pymc/pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

import numpy as np
import pandas as pd
try:
import polars as pl
except ImportError:
pl = None
import pytensor
import pytensor.tensor as pt
import scipy.sparse as sps
Expand Down Expand Up @@ -111,6 +115,18 @@
ret = np.ma.MaskedArray(vals, mask)
else:
ret = vals
elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
# Probably polars object

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not be a bit more explicit:

Suggested change
elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
elif pl is not None and isinstance(data, (pl.DataFrame, pl.Series)):

The polars namespace is used anyway (in the except clause).

vals = data.to_numpy()
try:
null_data = data.is_null()
except AttributeError:
null_data = data.with_columns(pl.all().is_null())
mask = null_data.to_numpy()
if mask.any():

Check warning on line 125 in pymc/pytensorf.py

View check run for this annotation

Codecov / codecov/patch

pymc/pytensorf.py#L119-L125

Added lines #L119 - L125 were not covered by tests
# there are missing values
ret = np.ma.MaskedArray(vals, mask)

Check warning on line 127 in pymc/pytensorf.py

View check run for this annotation

Codecov / codecov/patch

pymc/pytensorf.py#L127

Added line #L127 was not covered by tests
else:
ret = vals

Check warning on line 129 in pymc/pytensorf.py

View check run for this annotation

Codecov / codecov/patch

pymc/pytensorf.py#L129

Added line #L129 was not covered by tests
elif isinstance(data, np.ndarray):
if isinstance(data, np.ma.MaskedArray):
if not data.mask.any():
Expand Down Expand Up @@ -141,11 +157,18 @@
# Otherwise we only convert the precision.
return smarttypeX(ret)


@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
if pl is not None:
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
@_as_tensor_variable.register(pl.DataFrame)
@_as_tensor_variable.register(pl.Series)
def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)

Check warning on line 166 in pymc/pytensorf.py

View check run for this annotation

Codecov / codecov/patch

pymc/pytensorf.py#L161-L166

Added lines #L161 - L166 were not covered by tests
else:
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
Comment on lines +160 to +171
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more succinct. Also, the type hint of `df` was wrong, so I just removed it.

Suggested change
if pl is not None:
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
@_as_tensor_variable.register(pl.DataFrame)
@_as_tensor_variable.register(pl.Series)
def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
else:
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
if pl is not None:
@_as_tensor_variable.register(pl.DataFrame)
@_as_tensor_variable.register(pl.Series)
def polars_dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)



def extract_obs_data(x: TensorVariable) -> np.ndarray:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,21 @@ def test_implicit_coords_dataframe(self, seeded_test):
assert "columns" in pmodel.coords
assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}

def test_implicit_coords_polars_dataframe(self, seeded_test):
    """Check that ``pm.Data`` records "rows"/"columns" coords for a polars DataFrame.

    The test is skipped when polars cannot be imported.
    """
    pl = pytest.importorskip("polars")
    n_rows, n_cols = 5, 7

    # Build the columns one at a time, in column order, so the random draws
    # happen in the same sequence as a dict comprehension would produce.
    column_data = {}
    for c in range(n_cols):
        column_data[f"Column {c+1}"] = np.random.normal(size=(n_rows,))
    frame = pl.DataFrame(column_data).with_row_count("rows")

    # Let pm.Data infer the coordinates from the DataFrame itself.
    with pm.Model() as pmodel:
        pm.Data("observations", frame, dims=("rows", "columns"), infer_dims_and_coords=True)

    expected_dims = {"observations": ("rows", "columns")}
    assert "rows" in pmodel.coords
    assert "columns" in pmodel.coords
    assert pmodel.named_vars_to_dims == expected_dims

def test_implicit_coords_xarray(self):
xr = pytest.importorskip("xarray")
data = xr.DataArray([[1, 2, 3], [4, 5, 6]], dims=("y", "x"))
Expand Down
13 changes: 13 additions & 0 deletions tests/test_pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
df = pd.DataFrame(np_array)
np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)

@pytest.mark.parametrize(
    argnames="np_array",
    argvalues=[
        np.array([[1.0], [2.0], [-1.0]]),
        np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]),
        np.ones(shape=(10, 1)),
    ],
)
def test_polars_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
    """Check that a polars DataFrame converts to a tensor holding the source values.

    A DataFrame is built from ``np_array``, passed through
    ``pt.as_tensor_variable``, evaluated, and compared element-wise with the
    original array. The test is skipped when polars cannot be imported.
    """
    pl = pytest.importorskip("polars")
    df = pl.DataFrame(np_array)
    evaluated = pt.as_tensor_variable(x=df).eval()
    # Pass the arrays positionally: the ``x=``/``y=`` keyword names are
    # deprecated as of NumPy 2.0 (renamed to ``actual``/``desired``), while
    # positional arguments work on every NumPy version.
    np.testing.assert_array_equal(evaluated, np_array)


@pytest.mark.parametrize(
argnames="np_array",
Expand Down
Loading