Skip to content

Commit

Permalink
convert scikitlearn models behind the scenes
Browse files Browse the repository at this point in the history
  • Loading branch information
drbenvincent committed Aug 9, 2024
1 parent dede64a commit 02dacb2
Show file tree
Hide file tree
Showing 16 changed files with 994 additions and 1,027 deletions.
13 changes: 10 additions & 3 deletions causalpy/experiments/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@

from abc import abstractmethod

from sklearn.base import RegressorMixin

from causalpy.pymc_models import PyMCModel
from causalpy.skl_models import ScikitLearnModel
from causalpy.skl_models import create_causalpy_compatible_class


class BaseExperiment:
Expand All @@ -28,13 +30,18 @@ class BaseExperiment:
supports_ols: bool

def __init__(self, model=None):
# Ensure any provided scikit-learn model (identified as an instance of
# RegressorMixin) is made compatible with CausalPy by attaching our custom methods.
if isinstance(model, RegressorMixin):
model = create_causalpy_compatible_class(model)

if model is not None:
self.model = model

if isinstance(self.model, PyMCModel) and not self.supports_bayes:
raise ValueError("Bayesian models not supported.")

Check warning on line 42 in causalpy/experiments/base.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/base.py#L42

Added line #L42 was not covered by tests

if isinstance(self.model, ScikitLearnModel) and not self.supports_ols:
if isinstance(self.model, RegressorMixin) and not self.supports_ols:
raise ValueError("OLS models not supported.")

if self.model is None:
Expand All @@ -57,7 +64,7 @@ def plot(self, *args, **kwargs) -> tuple:
"""
if isinstance(self.model, PyMCModel):
return self.bayesian_plot(*args, **kwargs)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
return self.ols_plot(*args, **kwargs)
else:
raise ValueError("Unsupported model type")

Check warning on line 70 in causalpy/experiments/base.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/base.py#L70

Added line #L70 was not covered by tests
Expand Down
6 changes: 3 additions & 3 deletions causalpy/experiments/diff_in_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
import seaborn as sns
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
DataException,
FormulaException,
)
from causalpy.plot_utils import plot_xY
from causalpy.pymc_models import PyMCModel
from causalpy.skl_models import ScikitLearnModel
from causalpy.utils import _is_variable_dummy_coded, convert_to_string, round_num

from .base import BaseExperiment
Expand Down Expand Up @@ -106,7 +106,7 @@ def __init__(
if isinstance(self.model, PyMCModel):
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
self.model.fit(X=self.X, y=self.y, coords=COORDS)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
self.model.fit(X=self.X, y=self.y)
else:
raise ValueError("Model type not recognized")

Check warning on line 112 in causalpy/experiments/diff_in_diff.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/diff_in_diff.py#L112

Added line #L112 was not covered by tests
Expand Down Expand Up @@ -181,7 +181,7 @@ def __init__(
self.causal_impact = self.model.idata.posterior["beta"].isel(
{"coeffs": i}
)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
# This is the coefficient on the interaction term
# TODO: THIS IS NOT YET CORRECT ?????
self.causal_impact = (
Expand Down
4 changes: 2 additions & 2 deletions causalpy/experiments/prepostfit.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
import pandas as pd
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import BadIndexException
from causalpy.plot_utils import plot_xY
from causalpy.pymc_models import PyMCModel
from causalpy.skl_models import ScikitLearnModel
from causalpy.utils import round_num

from .base import BaseExperiment
Expand Down Expand Up @@ -77,7 +77,7 @@ def __init__(
if isinstance(self.model, PyMCModel):
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.pre_X.shape[0])}
self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
self.model.fit(X=self.pre_X, y=self.pre_y)
else:
raise ValueError("Model type not recognized")

Check warning on line 83 in causalpy/experiments/prepostfit.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/prepostfit.py#L83

Added line #L83 was not covered by tests
Expand Down
4 changes: 2 additions & 2 deletions causalpy/experiments/prepostnegd.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@
import seaborn as sns
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
DataException,
)
from causalpy.plot_utils import plot_xY
from causalpy.pymc_models import PyMCModel
from causalpy.skl_models import ScikitLearnModel
from causalpy.utils import _is_variable_dummy_coded, round_num

from .base import BaseExperiment
Expand Down Expand Up @@ -115,7 +115,7 @@ def __init__(
if isinstance(self.model, PyMCModel):
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
self.model.fit(X=self.X, y=self.y, coords=COORDS)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
raise NotImplementedError("Not implemented for OLS model")

Check warning on line 119 in causalpy/experiments/prepostnegd.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/prepostnegd.py#L118-L119

Added lines #L118 - L119 were not covered by tests
else:
raise ValueError("Model type not recognized")

Check warning on line 121 in causalpy/experiments/prepostnegd.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/prepostnegd.py#L121

Added line #L121 was not covered by tests
Expand Down
4 changes: 2 additions & 2 deletions causalpy/experiments/regression_discontinuity.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
import seaborn as sns
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
DataException,
FormulaException,
)
from causalpy.plot_utils import plot_xY
from causalpy.pymc_models import PyMCModel
from causalpy.skl_models import ScikitLearnModel
from causalpy.utils import _is_variable_dummy_coded, convert_to_string, round_num

from .base import BaseExperiment
Expand Down Expand Up @@ -126,7 +126,7 @@ def __init__(
# fit the model to the observed (pre-intervention) data
COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
self.model.fit(X=self.X, y=self.y, coords=COORDS)
elif isinstance(self.model, ScikitLearnModel):
elif isinstance(self.model, RegressorMixin):
self.model.fit(X=self.X, y=self.y)
else:
raise ValueError("Model type not recognized")

Check warning on line 132 in causalpy/experiments/regression_discontinuity.py

View check run for this annotation

Codecov / codecov/patch

causalpy/experiments/regression_discontinuity.py#L132

Added line #L132 was not covered by tests
Expand Down
24 changes: 16 additions & 8 deletions causalpy/skl_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from causalpy.utils import round_num


class ScikitLearnModel:
class ScikitLearnAdaptor:
"""Base class for scikit-learn models that can be used for causal inference."""

def calculate_impact(self, y_true, y_pred):
Expand Down Expand Up @@ -53,7 +53,7 @@ def get_coeffs(self):
return np.squeeze(self.coef_)


class WeightedProportion(ScikitLearnModel, LinearModel, RegressorMixin):
class WeightedProportion(ScikitLearnAdaptor, LinearModel, RegressorMixin):
"""Weighted proportion model for causal inference. Used for synthetic control
methods for example"""

Expand Down Expand Up @@ -82,11 +82,19 @@ def predict(self, X):

def create_causalpy_compatible_class(
estimator: type[RegressorMixin],
) -> type[ScikitLearnModel]:
) -> type[RegressorMixin]:
"""This function takes a scikit-learn estimator, attaches the CausalPy adaptor
methods to it in place, and returns that same estimator so that it is compatible
with CausalPy."""

class Model(ScikitLearnModel, estimator):
pass

return Model
_add_mixin_methods(estimator, ScikitLearnAdaptor)
return estimator


def _add_mixin_methods(model_instance, mixin_class):
"""Utility function to bind mixin methods to an existing model instance."""
for attr_name in dir(mixin_class):
attr = getattr(mixin_class, attr_name)
if callable(attr) and not attr_name.startswith("__"):
# Bind the method to the instance
method = attr.__get__(model_instance, model_instance.__class__)
setattr(model_instance, attr_name, method)
return model_instance
5 changes: 2 additions & 3 deletions causalpy/tests/test_input_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

from sklearn.linear_model import LinearRegression

CustomLinearRegression = cp.create_causalpy_compatible_class(LinearRegression)

sample_kwargs = {"tune": 20, "draws": 20, "chains": 2, "cores": 2}

Expand Down Expand Up @@ -254,7 +253,7 @@ def test_rd_validation_treated_in_formula():
_ = cp.RegressionDiscontinuity(
df,
formula="y ~ 1 + x",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
)

Expand All @@ -281,7 +280,7 @@ def test_rd_validation_treated_is_dummy():
_ = cp.RegressionDiscontinuity(
df,
formula="y ~ 1 + x + treated",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
)

Expand Down
30 changes: 11 additions & 19 deletions causalpy/tests/test_integration_skl_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
from sklearn.linear_model import LinearRegression

import causalpy as cp
from causalpy.skl_models import ScikitLearnModel

CustomLinearRegression = cp.create_causalpy_compatible_class(LinearRegression)


@pytest.mark.integration
Expand All @@ -42,7 +39,7 @@ def test_did():
group_variable_name="group",
treated=1,
untreated=0,
model=CustomLinearRegression(),
model=LinearRegression(),
)
assert isinstance(data, pd.DataFrame)
assert isinstance(result, cp.DifferenceInDifferences)
Expand Down Expand Up @@ -71,7 +68,7 @@ def test_rd_drinking():
df,
formula="all ~ 1 + age + treated",
running_variable_name="age",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=21,
epsilon=0.001,
)
Expand Down Expand Up @@ -103,7 +100,7 @@ def test_its():
df,
treatment_time,
formula="y ~ 1 + t + C(month)",
model=CustomLinearRegression(),
model=LinearRegression(),
)
assert isinstance(df, pd.DataFrame)
assert isinstance(result, cp.InterruptedTimeSeries)
Expand Down Expand Up @@ -165,7 +162,7 @@ def test_rd_linear_main_effects():
result = cp.RegressionDiscontinuity(
data,
formula="y ~ 1 + x + treated",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
epsilon=0.001,
)
Expand All @@ -191,7 +188,7 @@ def test_rd_linear_main_effects_bandwidth():
result = cp.skl_experiments.RegressionDiscontinuity(
data,
formula="y ~ 1 + x + treated",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
epsilon=0.001,
bandwidth=0.3,
Expand All @@ -217,7 +214,7 @@ def test_rd_linear_with_interaction():
result = cp.RegressionDiscontinuity(
data,
formula="y ~ 1 + x + treated + x:treated",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
epsilon=0.001,
)
Expand All @@ -238,18 +235,13 @@ def test_rd_linear_with_gaussian_process():
1. data is a dataframe
2. skl_experiments.RegressionDiscontinuity returns correct type
"""

# create a custom GaussianProcessRegressor class by subclassing
# GaussianProcessRegressor and adding the ScikitLearnModel mixin
class CustomGaussianProcessRegressor(GaussianProcessRegressor, ScikitLearnModel):
pass

data = cp.load_data("rd")
kernel = 1.0 * ExpSineSquared(1.0, 5.0) + WhiteKernel(1e-1)
result = cp.RegressionDiscontinuity(
data,
formula="y ~ 1 + x + treated",
model=CustomGaussianProcessRegressor(kernel=kernel),
model=GaussianProcessRegressor(kernel=kernel),
model_kwargs={"kernel": kernel},
treatment_threshold=0.5,
epsilon=0.001,
)
Expand All @@ -275,7 +267,7 @@ def test_did_deprecation_warning():
group_variable_name="group",
treated=1,
untreated=0,
model=CustomLinearRegression(),
model=LinearRegression(),
)
assert isinstance(result, cp.DifferenceInDifferences)

Expand All @@ -294,7 +286,7 @@ def test_its_deprecation_warning():
df,
treatment_time,
formula="y ~ 1 + t + C(month)",
model=CustomLinearRegression(),
model=LinearRegression(),
)
assert isinstance(result, cp.InterruptedTimeSeries)

Expand Down Expand Up @@ -322,7 +314,7 @@ def test_rd_deprecation_warning():
result = cp.skl_experiments.RegressionDiscontinuity(
data,
formula="y ~ 1 + x + treated",
model=CustomLinearRegression(),
model=LinearRegression(),
treatment_threshold=0.5,
epsilon=0.001,
)
Expand Down
Loading

0 comments on commit 02dacb2

Please sign in to comment.