From 8cc283e0251fb23eaba4d4175919a282330da47b Mon Sep 17 00:00:00 2001 From: Jason Preszler Date: Tue, 12 Sep 2023 10:35:32 -0700 Subject: [PATCH] remove redundencies, clean up some wording --- causalpy/data/simulate_data.py | 2 +- causalpy/pymc_experiments.py | 221 ++++++++------------------------- causalpy/pymc_models.py | 170 +++++++------------------ causalpy/skl_experiments.py | 59 +++------ 4 files changed, 115 insertions(+), 337 deletions(-) diff --git a/causalpy/data/simulate_data.py b/causalpy/data/simulate_data.py index e99f2eac..a2c9fa20 100644 --- a/causalpy/data/simulate_data.py +++ b/causalpy/data/simulate_data.py @@ -46,7 +46,7 @@ def generate_synthetic_control_data( :param N: Number fo data points :param treatment_time: - Index where treatment begins in the generated data frame + Index where treatment begins in the generated dataframe :param grw_mu: Mean of Gaussian Random Walk :param grw_sigma: diff --git a/causalpy/pymc_experiments.py b/causalpy/pymc_experiments.py index 69f0f25a..95fdafe4 100644 --- a/causalpy/pymc_experiments.py +++ b/causalpy/pymc_experiments.py @@ -52,25 +52,6 @@ def __init__(self, model=None, **kwargs): def idata(self): """ Access to the models InferenceData object - - Example - -------- - >>> import causalpy as cp - >>> df = cp.load_data("did") - >>> seed = 42 - >>> result = cp.pymc_experiments.DifferenceInDifferences( - ... df, - ... formula="y ~ 1 + group*post_treatment", - ... time_variable_name="t", - ... group_variable_name="group", - ... model=cp.pymc_models.LinearRegression( - ... sample_kwargs={"random_seed": seed, "progressbar": False}), - ... ) - >>> result.idata - Inference data... - >>> result.idata.posterior - - Dimensions... """ return self.model.idata @@ -127,7 +108,7 @@ class PrePostFit(ExperimentalDesign): the pre-intervention data. :param data: - A pandas data frame + A pandas dataframe :param treatment_time: The time when treatment occured, should be in reference to the data index :param formula: @@ -153,6 +134,18 @@ class PrePostFit(ExperimentalDesign): ... } ... ), ... ) + >>> result.summary() # doctest: +NUMBER + ==================================Pre-Post Fit================================== + Formula: actual ~ 0 + a + b + c + d + e + f + g + Model coefficients: + a 0.33, 94% HDI [0.30, 0.38] + b 0.05, 94% HDI [0.01, 0.09] + c 0.31, 94% HDI [0.26, 0.35] + d 0.06, 94% HDI [0.01, 0.10] + e 0.02, 94% HDI [0.00, 0.06] + f 0.20, 94% HDI [0.12, 0.26] + g 0.04, 94% HDI [0.00, 0.08] + sigma 0.26, 94% HDI [0.22, 0.30] """ def __init__( @@ -237,10 +230,6 @@ def _input_validation(self, data, treatment_time): def plot(self, counterfactual_label="Counterfactual", **kwargs): """ Plot the results - - Example - -------- - >>> result.plot() # doctest: +SKIP """ fig, ax = plt.subplots(3, 1, sharex=True, figsize=(7, 8)) @@ -343,38 +332,6 @@ def plot(self, counterfactual_label="Counterfactual", **kwargs): def summary(self) -> None: """ Print text output summarising the results - - Example - --------- - >>> import causalpy as cp - >>> sc = cp.load_data("sc") - >>> treatment_time = 70 - >>> seed = 42 - >>> result = cp.pymc_experiments.PrePostFit( - ... sc, - ... treatment_time, - ... formula="actual ~ 0 + a + b + c + d + e + f + g", - ... model=cp.pymc_models.WeightedSumFitter( - ... sample_kwargs={ - ... "draws": 2000, - ... "target_accept": 0.95, - ... "random_seed": seed, - ... "progressbar": False, - ... } - ... ), - ... ) - >>> result.summary() # doctest: +NUMBER - ==================================Pre-Post Fit================================== - Formula: actual ~ 0 + a + b + c + d + e + f + g - Model coefficients: - a 0.34, 94% HDI [0.30, 0.38] - b 0.05, 94% HDI [0.01, 0.09] - c 0.31, 94% HDI [0.26, 0.35] - d 0.06, 94% HDI [0.01, 0.10] - e 0.0, 94% HDI [0.0, 0.0] - f 0.1, 94% HDI [0.1, 0.2] - g 0.0, 94% HDI [0.0, 0.0] - sigma 0.26, 94% HDI [0.22, 0.30] """ print(f"{self.expt_type:=^80}") @@ -388,7 +345,7 @@ class InterruptedTimeSeries(PrePostFit): A wrapper around PrePostFit class :param data: - A pandas data frame + A pandas dataframe :param treatment_time: The time when treatment occured, should be in reference to the data index :param formula: @@ -427,7 +384,7 @@ class SyntheticControl(PrePostFit): """A wrapper around the PrePostFit class :param data: - A pandas data frame + A pandas dataframe :param treatment_time: The time when treatment occured, should be in reference to the data index :param formula: @@ -477,7 +434,7 @@ class DifferenceInDifferences(ExperimentalDesign): There is no pre/post intervention data distinction for DiD, we fit all the data available. :param data: - A pandas data frame + A pandas dataframe :param formula: A statistical model formula :param time_variable_name: @@ -505,6 +462,18 @@ class DifferenceInDifferences(ExperimentalDesign): ... } ... ) ... ) + >>> result.summary() # doctest: +NUMBER + ===========================Difference in Differences============================ + Formula: y ~ 1 + group*post_treatment + + Results: + Causal impact = 0.5, $CI_{94%}$[0.4, 0.6] + Model coefficients: + Intercept 1.0, 94% HDI [1.0, 1.1] + post_treatment[T.True] 0.9, 94% HDI [0.9, 1.0] + group 0.1, 94% HDI [0.0, 0.2] + group:post_treatment[T.True] 0.5, 94% HDI [0.4, 0.6] + sigma 0.0, 94% HDI [0.0, 0.1] """ def __init__( @@ -625,12 +594,6 @@ def _input_validation(self): def plot(self): """Plot the results. Creating the combined mean + HDI legend entries is a bit involved. - - Example - -------- - Assuming `result` is the result of a DiD experiment: - - >>> result.plot() # doctest: +SKIP """ fig, ax = plt.subplots() @@ -769,38 +732,6 @@ def _causal_impact_summary_stat(self) -> str: def summary(self) -> None: """ Print text output summarising the results - - Example - -------- - >>> import causalpy as cp - >>> df = cp.load_data("did") - >>> seed = 42 - >>> result = cp.pymc_experiments.DifferenceInDifferences( - ... df, - ... formula="y ~ 1 + group*post_treatment", - ... time_variable_name="t", - ... group_variable_name="group", - ... model=cp.pymc_models.LinearRegression( - ... sample_kwargs={ - ... "draws": 2000, - ... "target_accept": 0.95, - ... "random_seed": seed, - ... "progressbar": False, - ... } - ... ) - ... ) - >>> result.summary() # doctest: +NUMBER - ===========================Difference in Differences============================ - Formula: y ~ 1 + group*post_treatment - - Results: - Causal impact = 0.5, $CI_{94%}$[0.4, 0.6] - Model coefficients: - Intercept 1.0, 94% HDI [1.0, 1.1] - post_treatment[T.True] 0.9, 94% HDI [0.9, 1.0] - group 0.1, 94% HDI [0.0, 0.2] - group:post_treatment[T.True] 0.5, 94% HDI [0.4, 0.6] - sigma 0.0, 94% HDI [0.0, 0.1] """ print(f"{self.expt_type:=^80}") @@ -849,6 +780,20 @@ class RegressionDiscontinuity(ExperimentalDesign): ... ), ... treatment_threshold=0.5, ... ) + >>> result.summary() # doctest: +NUMBER + ============================Regression Discontinuity============================ + Formula: y ~ 1 + x + treated + x:treated + Running variable: x + Threshold on running variable: 0.5 + + Results: + Discontinuity at threshold = 0.91 + Model coefficients: + Intercept 0.09, 94% HDI [-0.00, 0.17] + treated[T.True] 2.45, 94% HDI [1.66, 3.28] + x 1.32, 94% HDI [1.14, 1.50] + x:treated[T.True] -3.08, 94% HDI [-4.17, -2.05] + sigma 0.36, 94% HDI [0.31, 0.41] """ def __init__( @@ -961,10 +906,6 @@ def _is_treated(self, x): def plot(self): """ Plot the results - - Example - -------- - >>> result.plot() # doctest: +SKIP """ fig, ax = plt.subplots() # Plot raw data @@ -1013,39 +954,6 @@ def plot(self): def summary(self) -> None: """ Print text output summarising the results - - Example - -------- - >>> import causalpy as cp - >>> df = cp.load_data("rd") - >>> seed = 42 - >>> result = cp.pymc_experiments.RegressionDiscontinuity( - ... df, - ... formula="y ~ 1 + x + treated + x:treated", - ... model=cp.pymc_models.LinearRegression( - ... sample_kwargs={ - ... "draws": 2000, - ... "target_accept": 0.95, - ... "random_seed": seed, - ... "progressbar": False, - ... }, - ... ), - ... treatment_threshold=0.5, - ... ) - >>> result.summary() # doctest: +NUMBER - ============================Regression Discontinuity============================ - Formula: y ~ 1 + x + treated + x:treated - Running variable: x - Threshold on running variable: 0.5 - - Results: - Discontinuity at threshold = 0.91 - Model coefficients: - Intercept 0.0, 94% HDI [0.0, 0.1] - treated[T.True] 2.4, 94% HDI [1.6, 3.2] - x 1.32, 94% HDI [1.14, 1.50] - x:treated[T.True] -3.09, 94% HDI [-4.16, -2.03] - sigma 0.36, 94% HDI [0.31, 0.41] """ print(f"{self.expt_type:=^80}") @@ -1064,7 +972,7 @@ class PrePostNEGD(ExperimentalDesign): A class to analyse data from pretest/posttest designs :param data: - A pandas data frame + A pandas dataframe :param formula: A statistical model formula :param group_variable_name: @@ -1092,6 +1000,17 @@ class PrePostNEGD(ExperimentalDesign): ... } ... ) ... ) + >>> result.summary() # doctest: +NUMBER + ==================Pretest/posttest Nonequivalent Group Design=================== + Formula: post ~ 1 + C(group) + pre + + Results: + Causal impact = 1.8, $CI_{94%}$[1.6, 2.0] + Model coefficients: + Intercept -0.4, 94% HDI [-1.2, 0.2] + C(group)[T.1] 1.8, 94% HDI [1.6, 2.0] + pre 1.0, 94% HDI [0.9, 1.1] + sigma 0.5, 94% HDI [0.4, 0.5] """ def __init__( @@ -1227,38 +1146,6 @@ def _causal_impact_summary_stat(self) -> str: def summary(self) -> None: """ Print text output summarising the results - - Example - -------- - >>> import causalpy as cp - >>> df = cp.load_data("anova1") - >>> seed = 42 - >>> result = cp.pymc_experiments.PrePostNEGD( - ... df, - ... formula="post ~ 1 + C(group) + pre", - ... group_variable_name="group", - ... pretreatment_variable_name="pre", - ... model=cp.pymc_models.LinearRegression( - ... sample_kwargs={ - ... "draws": 2000, - ... "target_accept": 0.95, - ... "random_seed": seed, - ... "progressbar": False, - ... } - ... ) - ... ) - >>> result.summary() # doctest: +NUMBER - ==================Pretest/posttest Nonequivalent Group Design=================== - Formula: post ~ 1 + C(group) + pre - - Results: - Causal impact = 1.8, $CI_{94%}$[1.6, 2.0] - Model coefficients: - Intercept -0.4, 94% HDI [-1.2, 0.2] - C(group)[T.1] 1.8, 94% HDI [1.6, 2.0] - pre 1.0, 94% HDI [0.9, 1.1] - sigma 0.5, 94% HDI [0.4, 0.5] - """ print(f"{self.expt_type:=^80}") diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py index be93c188..834b9c66 100644 --- a/causalpy/pymc_models.py +++ b/causalpy/pymc_models.py @@ -30,7 +30,43 @@ class ModelBuilder(pm.Model): - build_model: must be implemented by subclasses - fit: populates idata attribute - predict: returns predictions on new data - - score: returns Bayesian :math: `R^2` + - score: returns Bayesian :math:`R^2` + + Example + ------- + >>> import causalpy as cp + >>> import numpy as np + >>> import pymc as pm + >>> from causalpy.pymc_models import ModelBuilder + >>> class MyToyModel(ModelBuilder): + ... def build_model(self, X, y, coords): + ... with self: + ... X_ = pm.MutableData(name="X", value=X) + ... y_ = pm.MutableData(name="y", value=y) + ... beta = pm.Normal("beta", mu=0, sigma=1, shape=X_.shape[1]) + ... sigma = pm.HalfNormal("sigma", sigma=1) + ... mu = pm.Deterministic("mu", pm.math.dot(X_, beta)) + ... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) + >>> rng = np.random.default_rng(seed=42) + >>> X = rng.normal(loc=0, scale=1, size=(20, 2)) + >>> y = rng.normal(loc=0, scale=1, size=(20,)) + >>> model = MyToyModel( + ... sample_kwargs={ + ... "chains": 2, + ... "draws": 2000, + ... "progressbar": False, + ... "random_seed": rng, + ... } + ... ) + >>> model.fit(X, y) + Inference... + >>> X_new = rng.normal(loc=0, scale=1, size=(20,2)) + >>> model.predict(X_new) + Inference... + >>> model.score(X, y) # doctest: +NUMBER + r2 0.38 + r2_std 0.08 + dtype: float64 """ def __init__(self, sample_kwargs: Optional[Dict[str, Any]] = None): @@ -43,22 +79,7 @@ def __init__(self, sample_kwargs: Optional[Dict[str, Any]] = None): self.sample_kwargs = sample_kwargs if sample_kwargs is not None else {} def build_model(self, X, y, coords) -> None: - """Build the model, must be implemented by subclass. - - Example - ------- - >>> import pymc as pm - >>> from causalpy.pymc_models import ModelBuilder - >>> class CausalPyModel(ModelBuilder): - ... def build_model(self, X, y): - ... with self: - ... X_ = pm.MutableData(name="X", value=X) - ... y_ = pm.MutableData(name="y", value=y) - ... beta = pm.Normal("beta", mu=0, sigma=1, shape=X_.shape[1]) - ... sigma = pm.HalfNormal("sigma", sigma=1) - ... mu = pm.Deterministic("mu", pm.math.dot(X_, beta)) - ... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) - """ + """Build the model, must be implemented by subclass.""" raise NotImplementedError("This method must be implemented by a subclass") def _data_setter(self, X) -> None: @@ -74,32 +95,6 @@ def _data_setter(self, X) -> None: def fit(self, X, y, coords: Optional[Dict[str, Any]] = None) -> None: """Draw samples fromposterior, prior predictive, and posterior predictive distributions, placing them in the model's idata attribute. - - .. note:: - Calls the build_model method - - Example - ------- - >>> import numpy as np - >>> import pymc as pm - >>> from causalpy.pymc_models import ModelBuilder - >>> class MyToyModel(ModelBuilder): - ... def build_model(self, X, y, coords): - ... with self: - ... X_ = pm.MutableData(name="X", value=X) - ... y_ = pm.MutableData(name="y", value=y) - ... beta = pm.Normal("beta", mu=0, sigma=1, shape=X_.shape[1]) - ... sigma = pm.HalfNormal("sigma", sigma=1) - ... mu = pm.Deterministic("mu", pm.math.dot(X_, beta)) - ... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) - >>> rng = np.random.default_rng(seed=42) - >>> X = rng.normal(loc=0, scale=1, size=(20, 2)) - >>> y = rng.normal(loc=0, scale=1, size=(20,)) - >>> model = MyToyModel( - ... sample_kwargs={"chains": 2, "draws": 2, "progressbar": False} - ... ) - >>> model.fit(X, y) - Inference ... """ self.build_model(X, y, coords) with self.model: @@ -117,32 +112,6 @@ def predict(self, X): .. caution:: Results in KeyError if model hasn't been fit. - Example - ------- - >>> import causalpy as cp - >>> import numpy as np - >>> import pymc as pm - >>> from causalpy.pymc_models import ModelBuilder - >>> class MyToyModel(ModelBuilder): - ... def build_model(self, X, y, coords): - ... with self: - ... X_ = pm.MutableData(name="X", value=X) - ... y_ = pm.MutableData(name="y", value=y) - ... beta = pm.Normal("beta", mu=0, sigma=1, shape=X_.shape[1]) - ... sigma = pm.HalfNormal("sigma", sigma=1) - ... mu = pm.Deterministic("mu", pm.math.dot(X_, beta)) - ... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) - >>> rng = np.random.default_rng(seed=42) - >>> X = rng.normal(loc=0, scale=1, size=(20, 2)) - >>> y = rng.normal(loc=0, scale=1, size=(20,)) - >>> model = MyToyModel( - ... sample_kwargs={"chains": 2, "draws": 2, "progressbar": False} - ... ) - >>> model.fit(X, y) - Inference... - >>> X_new = rng.normal(loc=0, scale=1, size=(20,2)) - >>> model.predict(X_new) - Inference... """ self._data_setter(X) @@ -160,38 +129,6 @@ def score(self, X, y) -> pd.Series: The Bayesian :math:`R^2` is not the same as the traditional coefficient of determination, https://en.wikipedia.org/wiki/Coefficient_of_determination. - Example - -------- - >>> import causalpy as cp - >>> import numpy as np - >>> import pymc as pm - >>> from causalpy.pymc_models import ModelBuilder - >>> class MyToyModel(ModelBuilder): - ... def build_model(self, X, y, coords): - ... with self: - ... X_ = pm.MutableData(name="X", value=X) - ... y_ = pm.MutableData(name="y", value=y) - ... beta = pm.Normal("beta", mu=0, sigma=1, shape=X_.shape[1]) - ... sigma = pm.HalfNormal("sigma", sigma=1) - ... mu = pm.Deterministic("mu", pm.math.dot(X_, beta)) - ... pm.Normal("y_hat", mu=mu, sigma=sigma, observed=y_) - >>> rng = np.random.default_rng(seed=42) - >>> X = rng.normal(loc=0, scale=1, size=(200, 2)) - >>> y = rng.normal(loc=0, scale=1, size=(200,)) - >>> model = MyToyModel( - ... sample_kwargs={ - ... "chains": 2, - ... "draws": 2000, - ... "progressbar": False, - ... "random_seed": rng - ... } - ... ) - >>> model.fit(X, y) - Inference... - >>> model.score(X, y) # doctest: +NUMBER - r2 0.34 - r2_std 0.02 - dtype: float64 """ yhat = self.predict(X) yhat = az.extract( @@ -207,7 +144,7 @@ class WeightedSumFitter(ModelBuilder): """ Used for synthetic control experiments - .. note: Generally, the `.fit()` method should be rather than calling + .. note: Generally, the `.fit()` method should be used rather than calling `.build_model()` directly. Defines the PyMC model: @@ -237,19 +174,8 @@ class WeightedSumFitter(ModelBuilder): def build_model(self, X, y, coords): """ - Defines the PyMC model: - - .. math:: - - sigma \sim HalfNormal(1) - - beta \sim Dirichlet(1,...,1) - - mu = X * beta - - y \sim Normal(mu, sigma) - - """ # noqa: W605 + Defines the PyMC model + """ with self: self.add_coords(coords) n_predictors = X.shape[1] @@ -270,7 +196,7 @@ class LinearRegression(ModelBuilder): """ Custom PyMC model for linear regression - .. note: Generally, the `.fit()` method should be rather than calling + .. note: Generally, the `.fit()` method should be used rather than calling `.build_model()` directly. Defines the PyMC model @@ -304,17 +230,7 @@ class LinearRegression(ModelBuilder): def build_model(self, X, y, coords): """ Defines the PyMC model - - .. math:: - beta \sim Normal(0, 50) - - sigma \sim HalfNormal(1) - - mu = X * beta - - y \sim Normal(mu, sigma) - - """ # noqa: W605 + """ with self: self.add_coords(coords) X = pm.MutableData("X", X, dims=["obs_ind", "coeffs"]) diff --git a/causalpy/skl_experiments.py b/causalpy/skl_experiments.py index 8be30d4e..9db34736 100644 --- a/causalpy/skl_experiments.py +++ b/causalpy/skl_experiments.py @@ -1,7 +1,7 @@ """ Experiments for Scikit-Learn models -- ExperimentalDesign: base class for skl experiments +- ExperimentalDesign: base class for scikit-learn experiments - PrePostFit: base class for synthetic control and interrupted time series - SyntheticControl - InterruptedTimeSeries @@ -45,7 +45,7 @@ class PrePostFit(ExperimentalDesign): :param formula: A statistical model formula :param model: - An sklearn model object + An scikit-learn model object Example -------- @@ -59,6 +59,8 @@ class PrePostFit(ExperimentalDesign): ... formula="actual ~ 0 + a + b + c + d + e + f + g", ... model = cp.skl_models.WeightedProportion() ... ) + >>> result.get_coeffs() + array(...) """ def __init__( @@ -177,21 +179,6 @@ def plot(self, counterfactual_label="Counterfactual", **kwargs): def get_coeffs(self): """ Returns model coefficients - - Example - -------- - >>> from sklearn.linear_model import LinearRegression - >>> import causalpy as cp - >>> df = cp.load_data("sc") - >>> treatment_time = 70 - >>> result = cp.skl_experiments.PrePostFit( - ... df, - ... treatment_time, - ... formula="actual ~ 0 + a + b + c + d + e + f + g", - ... model = cp.skl_models.WeightedProportion() - ... ) - >>> result.get_coeffs() - array(...) """ return np.squeeze(self.model.coef_) @@ -299,7 +286,7 @@ class DifferenceInDifferences(ExperimentalDesign): :param group_variable_name: Name of the data column for the group variable :param model: - An skl model for difference in differences + An scikit-learn model for difference in differences Example -------- @@ -513,6 +500,18 @@ class RegressionDiscontinuity(ExperimentalDesign): ... model=LinearRegression(), ... treatment_threshold=0.5, ... ) + >>> result.summary() # doctest: +NORMALIZE_WHITESPACE,+NUMBER + Difference in Differences experiment + Formula: y ~ 1 + x + treated + Running variable: x + Threshold on running variable: 0.5 + + Results: + Discontinuity at threshold = 0.19 + Model coefficients: + Intercept 0.0 + treated[T.True] 0.19 + x 1.23 """ def __init__( @@ -645,30 +644,6 @@ def plot(self): def summary(self): """ Print text output summarising the results - - Example - -------- - >>> import causalpy as cp - >>> from sklearn.linear_model import LinearRegression - >>> data = cp.load_data("rd") - >>> result = cp.skl_experiments.RegressionDiscontinuity( - ... data, - ... formula="y ~ 1 + x + treated", - ... model=LinearRegression(), - ... treatment_threshold=0.5, - ... ) - >>> result.summary() # doctest: +NORMALIZE_WHITESPACE,+NUMBER - Difference in Differences experiment - Formula: y ~ 1 + x + treated - Running variable: x - Threshold on running variable: 0.5 - - Results: - Discontinuity at threshold = 0.19 - Model coefficients: - Intercept 0.0 - treated[T.True] 0.19 - x 1.23 """ print("Difference in Differences experiment") print(f"Formula: {self.formula}")