pymc-devs · fonnesbeck · Aug 15, 2024 · Aug 16, 2024 · Aug 16, 2024 · Aug 16, 2024
diff --git a/pymc/data.py b/pymc/data.py
@@ -21,7 +21,7 @@
 from typing import cast
 
 import numpy as np
-import pandas as pd
+from numpy.typing import ArrayLike
 import pytensor
 import pytensor.tensor as pt
 import xarray as xr
@@ -204,7 +204,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
 
 def determine_coords(
     model,
-    value: pd.DataFrame | pd.Series | xr.DataArray,
+    value: ArrayLike,
     dims: Sequence[str | None] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
 ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None]]:
@@ -348,7 +348,7 @@ def Data(
     ----------
     name : str
         The name for this variable.
-    value : array_like or pandas.Series, pandas.Dataframe
+    value : array_like or pandas.Series, pandas.DataFrame, polars.DataFrame, polars.Series, xarray.DataArray
         A value to associate with this variable.
     dims : str or tuple of str, optional
         Dimension names of the random variables (as opposed to the shapes of these

diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
@@ -18,6 +18,10 @@
 
 import numpy as np
 import pandas as pd
+try:
+    import polars as pl
+except ImportError:
+    pl = None
 import pytensor
 import pytensor.tensor as pt
 import scipy.sparse as sps
@@ -111,6 +115,18 @@
             ret = np.ma.MaskedArray(vals, mask)
         else:
             ret = vals
+    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
-    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+        # Probably polars object
-    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+    elif pl is not None and isinstance(data, (pl.DataFrame, pl.Series)):
-    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+        # Probably polars object
-    elif hasattr(data, "to_numpy") and hasattr(data, "is_null"):
+    elif pl is not None and isinstance(data, (pl.DataFrame, pl.Series)):
+        vals = data.to_numpy()
+        try:
+            null_data = data.is_null()
+        except AttributeError:
+            null_data = data.with_columns(pl.all().is_null())
+        mask = null_data.to_numpy()
+        if mask.any():
+            # there are missing values
+            ret = np.ma.MaskedArray(vals, mask)
+        else:
+            ret = vals
     elif isinstance(data, np.ndarray):
         if isinstance(data, np.ma.MaskedArray):
             if not data.mask.any():
@@ -141,11 +157,18 @@
     # Otherwise we only convert the precision.
     return smarttypeX(ret)
 
-
-@_as_tensor_variable.register(pd.Series)
-@_as_tensor_variable.register(pd.DataFrame)
-def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
-    return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+if pl is not None:
+    @_as_tensor_variable.register(pd.Series)
+    @_as_tensor_variable.register(pd.DataFrame)
+    @_as_tensor_variable.register(pl.DataFrame)
+    @_as_tensor_variable.register(pl.Series)
+    def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable:
+        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+else:
+    @_as_tensor_variable.register(pd.Series)
+    @_as_tensor_variable.register(pd.DataFrame)
+    def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
+        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
-if pl is not None:
-    @_as_tensor_variable.register(pd.Series)
-    @_as_tensor_variable.register(pd.DataFrame)
-    @_as_tensor_variable.register(pl.DataFrame)
-    @_as_tensor_variable.register(pl.Series)
-    def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable:
-        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
-else:
-    @_as_tensor_variable.register(pd.Series)
-    @_as_tensor_variable.register(pd.DataFrame)
-    def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
-        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+@_as_tensor_variable.register(pd.Series)
+@_as_tensor_variable.register(pd.DataFrame)
+def dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
+    """Convert a pandas ``Series`` or ``DataFrame`` into a tensor variable.
+
+    The object is turned into a NumPy array with ``to_numpy()`` and handed to
+    ``pt.as_tensor_variable``; ``*args`` and ``**kwargs`` are forwarded unchanged.
+    """
+    return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
+if pl is not None:
+    # ``pl`` is ``None`` when the optional polars import failed, so the polars
+    # handlers are registered only when the library is available.
+    @_as_tensor_variable.register(pl.DataFrame)
+    @_as_tensor_variable.register(pl.Series)
+    def polars_dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
+        """Convert a polars ``DataFrame`` or ``Series`` into a tensor variable.
+
+        The object is turned into a NumPy array with ``to_numpy()`` and handed to
+        ``pt.as_tensor_variable``; ``*args`` and ``**kwargs`` are forwarded unchanged.
+        """
+        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
-if pl is not None:
-    @_as_tensor_variable.register(pd.Series)
-    @_as_tensor_variable.register(pd.DataFrame)
-    @_as_tensor_variable.register(pl.DataFrame)
-    @_as_tensor_variable.register(pl.Series)
-    def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable:
-        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
-else:
-    @_as_tensor_variable.register(pd.Series)
-    @_as_tensor_variable.register(pd.DataFrame)
-    def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
-        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+@_as_tensor_variable.register(pd.Series)
+@_as_tensor_variable.register(pd.DataFrame)
+def dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
+    """Convert a pandas ``Series`` or ``DataFrame`` into a tensor variable.
+
+    The object is turned into a NumPy array with ``to_numpy()`` and handed to
+    ``pt.as_tensor_variable``; ``*args`` and ``**kwargs`` are forwarded unchanged.
+    """
+    return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
+if pl is not None:
+    # ``pl`` is ``None`` when the optional polars import failed, so the polars
+    # handlers are registered only when the library is available.
+    @_as_tensor_variable.register(pl.DataFrame)
+    @_as_tensor_variable.register(pl.Series)
+    def polars_dataframe_to_tensor_variable(df, *args, **kwargs) -> TensorVariable:
+        """Convert a polars ``DataFrame`` or ``Series`` into a tensor variable.
+
+        The object is turned into a NumPy array with ``to_numpy()`` and handed to
+        ``pt.as_tensor_variable``; ``*args`` and ``**kwargs`` are forwarded unchanged.
+        """
+        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
 
 
 def extract_obs_data(x: TensorVariable) -> np.ndarray:

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -404,6 +404,21 @@ def test_implicit_coords_dataframe(self, seeded_test):
         assert "columns" in pmodel.coords
         assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
 
+    def test_implicit_coords_polars_dataframe(self, seeded_test):
+        """Check that ``pm.Data`` registers "rows"/"columns" coords and dims for a polars DataFrame."""
+        pl = pytest.importorskip("polars")  # optional dependency: skip when it is not installed
+        N_rows, N_cols = 5, 7
+        df_data = pl.DataFrame({f"Column {c+1}": np.random.normal(size=(N_rows,)) for c in range(N_cols)})
+        df_data = df_data.with_row_index("rows")  # replaces the deprecated ``with_row_count``
+
+        # infer coordinates from index and columns of the DataFrame
+        with pm.Model() as pmodel:
+            pm.Data("observations", df_data, dims=("rows", "columns"), infer_dims_and_coords=True)
+
+        assert "rows" in pmodel.coords
+        assert "columns" in pmodel.coords
+        assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
+
     def test_implicit_coords_xarray(self):
         xr = pytest.importorskip("xarray")
         data = xr.DataArray([[1, 2, 3], [4, 5, 6]], dims=("y", "x"))

diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py
@@ -66,6 +66,19 @@ def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
     df = pd.DataFrame(np_array)
     np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
 
+@pytest.mark.parametrize(
+    argnames="np_array",
+    argvalues=[
+        np.array([[1.0], [2.0], [-1.0]]),  # shape (3, 1)
+        np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]),  # shape (2, 3)
+        np.ones(shape=(10, 1)),  # shape (10, 1)
+    ],
+)
+def test_polars_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
+    pl = pytest.importorskip("polars")  # optional dependency: skip when it is not installed
+    df = pl.DataFrame(np_array)  # the tensor built from this frame must evaluate back to np_array
+    np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
+
 
 @pytest.mark.parametrize(
     argnames="np_array",