Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't run prior and posterior predictive when calling fit #365

Merged
merged 9 commits into from
Sep 1, 2023
2 changes: 1 addition & 1 deletion pymc_marketing/clv/models/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def fit_summary(self, **kwargs):
def output_var(self):
pass

def generate_and_preprocess_model_data(
def _generate_and_preprocess_model_data(
self,
X: Union[pd.DataFrame, pd.Series],
y: Union[pd.Series, np.ndarray[Any, Any]],
Expand Down
28 changes: 15 additions & 13 deletions pymc_marketing/mmm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(
**kwargs,
) -> None:
self.X: Optional[pd.DataFrame] = None
self.y: Optional[pd.Series] = None
self.y: Optional[Union[pd.Series, np.ndarray]] = None
self.date_column: str = date_column
self.channel_columns: Union[List[str], Tuple[str]] = channel_columns
self.n_channel: int = len(channel_columns)
Expand All @@ -69,8 +69,8 @@ def methods(self) -> List[Any]:
def validation_methods(
self,
) -> Tuple[
List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]],
List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]],
List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]],
List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]],
]:
"""
A property that provides validation methods for features ("X") and the target variable ("y").
Expand Down Expand Up @@ -98,7 +98,9 @@ def validation_methods(
],
)

def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None:
def validate(
self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray]
) -> None:
"""
Validates the input data based on the specified target type.

Expand All @@ -110,7 +112,7 @@ def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None:
target : str
The type of target to be validated.
Expected values are "X" for features and "y" for the target variable.
data : Union[pd.DataFrame, pd.Series]
data : Union[pd.DataFrame, pd.Series, np.ndarray]
The input data to be validated.

Raises
Expand All @@ -134,14 +136,14 @@ def preprocessing_methods(
) -> Tuple[
List[
Callable[
["BaseMMM", Union[pd.DataFrame, pd.Series]],
Union[pd.DataFrame, pd.Series],
["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]],
Union[pd.DataFrame, pd.Series, np.ndarray],
]
],
List[
Callable[
["BaseMMM", Union[pd.DataFrame, pd.Series]],
Union[pd.DataFrame, pd.Series],
["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]],
Union[pd.DataFrame, pd.Series, np.ndarray],
]
],
]:
Expand Down Expand Up @@ -171,8 +173,8 @@ def preprocessing_methods(
)

def preprocess(
self, target: str, data: Union[pd.DataFrame, pd.Series]
) -> Union[pd.DataFrame, pd.Series]:
self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray]
) -> Union[pd.DataFrame, pd.Series, np.ndarray]:
"""
Preprocess the provided data according to the specified target.

Expand All @@ -184,12 +186,12 @@ def preprocess(
target : str
Indicates whether the data represents features ("X") or the target variable ("y").

data : pd.DataFrame
data : Union[pd.DataFrame, pd.Series, np.ndarray]
The data to be preprocessed.

Returns
-------
Union[pd.DataFrame, pd.Series]
Union[pd.DataFrame, pd.Series, np.ndarray]
The preprocessed data.

Raises
Expand Down
43 changes: 34 additions & 9 deletions pymc_marketing/mmm/delayed_saturated_mmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,11 @@ def default_sampler_config(self) -> Dict:

@property
def output_var(self):
    """Return the name of the model's target variable, which is always ``"y"``."""
    return "y"

def generate_and_preprocess_model_data( # type: ignore
self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
def _generate_and_preprocess_model_data( # type: ignore
self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray]
) -> None:
"""
Applies preprocessing to the data before fitting the model.
Expand All @@ -93,8 +94,8 @@ def generate_and_preprocess_model_data( # type: ignore

Parameters
----------
X : array, shape (n_obs, n_features)
y : array, shape (n_obs,)
X : Union[pd.DataFrame, pd.Series], shape (n_obs, n_features)
y : Union[pd.Series, np.ndarray], shape (n_obs,)
"""
date_data = X[self.date_column]
channel_data = X[self.channel_columns]
Expand Down Expand Up @@ -126,11 +127,11 @@ def generate_and_preprocess_model_data( # type: ignore
self.validate("X", X_data)
self.validate("y", y)
self.preprocessed_data: Dict[str, Union[pd.DataFrame, pd.Series]] = {
"X": self.preprocess("X", X_data),
"y": self.preprocess("y", y),
"X": self.preprocess("X", X_data), # type: ignore
"y": self.preprocess("y", y), # type: ignore
}
self.X: pd.DataFrame = X_data
self.y: pd.Series = y
self.y: Union[pd.Series, np.ndarray] = y

def _save_input_params(self, idata) -> None:
"""Saves input parameters to the attrs of idata."""
Expand All @@ -144,11 +145,35 @@ def _save_input_params(self, idata) -> None:
def build_model(
self,
X: pd.DataFrame,
y: pd.Series,
y: Union[pd.Series, np.ndarray],
**kwargs,
) -> None:
"""
Builds a probabilistic model using PyMC for marketing mix modeling.

The model incorporates channels, control variables, and Fourier components, applying
adstock and saturation transformations to the channel data. The final model is
constructed with multiple factors contributing to the response variable.

Parameters
----------
X : pd.DataFrame
The input data for the model, which should include columns for channels,
control variables (if applicable), and Fourier components (if applicable).

y : Union[pd.Series, np.ndarray]
The target/response variable for the modeling.

**kwargs : dict
Additional keyword arguments that might be required by underlying methods or utilities.

Attributes Set
---------------
model : pm.Model
The PyMC model object containing all the defined stochastic and deterministic variables.
"""
model_config = self.model_config
self.generate_and_preprocess_model_data(X, y)
self._generate_and_preprocess_model_data(X, y)
with pm.Model(coords=self.model_coords) as self.model:
channel_data_ = pm.MutableData(
name="channel_data",
Expand Down
95 changes: 26 additions & 69 deletions pymc_marketing/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class ModelBuilder(ABC):
version = "None"

X: Optional[pd.DataFrame] = None
y: Optional[pd.Series] = None
y: Optional[Union[pd.Series, np.ndarray]] = None

def __init__(
self,
Expand Down Expand Up @@ -195,28 +195,28 @@ def default_sampler_config(self) -> Dict:
raise NotImplementedError

@abstractmethod
def generate_and_preprocess_model_data(
self,
X: Union[pd.DataFrame, pd.Series],
y: Union[pd.Series, np.ndarray[Any, Any]],
def _generate_and_preprocess_model_data(
self, X: Union[pd.DataFrame, pd.Series], y: np.ndarray
) -> None:
"""
Applies preprocessing to the data before fitting the model.
If validate is True, it will check if the data is valid for the model.
Sets self.model_coords based on the provided dataset.

In case of optional parameters being passed into the model, this method should implement the conditional
logic responsible for correct handling of the optional parameters, and including them into the dataset.

Parameters:
X : array, shape (n_obs, n_features)
y : array, shape (n_obs,)

Examples
--------
>>> @classmethod
>>> def generate_and_preprocess_model_data(self, X, y):
>>> x = np.linspace(start=1, stop=50, num=100)
>>> y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 + np.random.rand(100)*6.4
>>> X = pd.DataFrame(x, columns=['x'])
>>> y = pd.Series(y, name='y')
>>> def _generate_and_preprocess_model_data(self, X, y):
coords = {
'x_dim': X.dim_variable,
} #only include if applicable for your model
>>> self.X = X
>>> self.y = y

Expand All @@ -231,7 +231,7 @@ def generate_and_preprocess_model_data(
def build_model(
self,
X: pd.DataFrame,
y: pd.Series,
y: Union[pd.Series, np.ndarray],
**kwargs,
) -> None:
"""
Expand All @@ -246,7 +246,7 @@ def build_model(
only contain the necessary data columns, not the entire available dataset, as this
will be encoded into the data used to recreate the model.

y : pd.Series
y : Union[pd.Series, np.ndarray]
The target data for the model. This should be a Series or array representing the output
or dependent variable for the model.

Expand All @@ -268,49 +268,6 @@ def build_model(
"""
raise NotImplementedError

def sample_model(self, **kwargs):
    """
    Sample from the PyMC model.

    Draws posterior samples with ``pm.sample`` and extends the result with
    prior predictive and posterior predictive samples. The combined result
    is then passed through ``set_idata_attrs``.

    Parameters
    ----------
    **kwargs : dict
        Additional keyword arguments to pass to the PyMC sampler. They take
        precedence over entries of ``self.sampler_config`` with the same key.

    Returns
    -------
    arviz.InferenceData
        The sampling result (posterior, prior and posterior predictive
        groups), as returned by ``set_idata_attrs``.

    Raises
    ------
    RuntimeError
        If the PyMC model hasn't been built yet.

    Examples
    --------
    >>> self.build_model(X, y)
    >>> idata = self.sample_model(draws=100, tune=10)
    >>> assert isinstance(idata, az.InferenceData)
    >>> assert "posterior" in idata
    >>> assert "prior" in idata
    >>> assert "observed_data" in idata
    >>> assert "log_likelihood" in idata
    """
    # Fail early with an actionable message instead of an opaque error from
    # entering a ``None`` model context.
    if self.model is None:
        raise RuntimeError(
            "The model hasn't been built yet, call .build_model() first or call .fit() instead."
        )

    with self.model:
        # Call-time kwargs override the stored sampler configuration.
        sampler_args = {**self.sampler_config, **kwargs}
        idata = pm.sample(**sampler_args)
        # Both predictive draws are merged into the same result object.
        idata.extend(pm.sample_prior_predictive())
        idata.extend(pm.sample_posterior_predictive(idata))

    idata = self.set_idata_attrs(idata)
    return idata

def set_idata_attrs(self, idata=None):
"""
Set attributes on an InferenceData object.
Expand All @@ -334,11 +291,6 @@ def set_idata_attrs(self, idata=None):
>>> model = MyModel(ModelBuilder)
>>> idata = az.InferenceData(your_dataset)
>>> model.set_idata_attrs(idata=idata)
>>> assert "id" in idata.attrs #this and the following lines are part of doctest, not user manual
>>> assert "model_type" in idata.attrs
>>> assert "version" in idata.attrs
>>> assert "sampler_config" in idata.attrs
>>> assert "model_config" in idata.attrs
"""
if idata is None:
idata = self.idata
Expand Down Expand Up @@ -381,7 +333,7 @@ def save(self, fname: str) -> None:
>>> def __init__(self):
>>> super().__init__()
>>> model = MyModel()
>>> model.fit(data)
>>> model.fit(X,y)
>>> model.save('model_results.nc') # This will call the overridden method in MyModel
"""
if self.idata is not None and "posterior" in self.idata:
Expand Down Expand Up @@ -468,7 +420,7 @@ def fit(
y: Optional[Union[pd.Series, np.ndarray]] = None,
progressbar: bool = True,
predictor_names: Optional[List[str]] = None,
random_seed: RandomState = None,
random_seed: Optional[RandomState] = None,
**kwargs: Any,
) -> az.InferenceData:
"""
Expand All @@ -484,10 +436,10 @@ def fit(
The target values (real numbers).
progressbar : bool
Specifies whether the fit progressbar should be displayed
predictor_names: List[str] = None,
predictor_names: Optional[List[str]] = None,
Allows custom naming of predictors when they are given in the form of a 2D array (np.ndarray).
If not provided, the predictors will be named predictor1, predictor2, ...
random_seed : RandomState
random_seed : Optional[RandomState]
Provides sampler with initial random seed for obtaining reproducible samples
**kwargs : Any
Custom sampler settings can be provided in form of keyword arguments.
Expand All @@ -499,7 +451,7 @@ def fit(
Examples
--------
>>> model = MyModel()
>>> idata = model.fit(data)
>>> idata = model.fit(X,y)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
"""
Expand All @@ -508,7 +460,7 @@ def fit(
if y is None:
y = np.zeros(X.shape[0])
y_df = pd.DataFrame({self.output_var: y})
self.generate_and_preprocess_model_data(X, y_df.values.flatten())
self._generate_and_preprocess_model_data(X, y_df.values.flatten())
if self.X is None or self.y is None:
raise ValueError("X and y must be set before calling build_model!")
self.build_model(self.X, self.y)
Expand All @@ -517,7 +469,12 @@ def fit(
sampler_config["progressbar"] = progressbar
sampler_config["random_seed"] = random_seed
sampler_config.update(**kwargs)
self.idata = self.sample_model(**sampler_config)

sampler_config.update(**kwargs)
if self.model is not None:
with self.model:
sampler_args = {**self.sampler_config, **kwargs}
self.idata = pm.sample(**sampler_args)

X_df = pd.DataFrame(X, columns=X.columns)
combined_data = pd.concat([X_df, y_df], axis=1)
Expand All @@ -529,7 +486,7 @@ def fit(
message="The group fit_data is not defined in the InferenceData scheme",
)
self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore

self.set_idata_attrs(self.idata)
return self.idata # type: ignore

def predict(
Expand Down Expand Up @@ -558,7 +515,7 @@ def predict(
Examples
--------
>>> model = MyModel()
>>> idata = model.fit(data)
>>> idata = model.fit(X,y)
>>> x_pred = []
>>> prediction_data = pd.DataFrame({'input':x_pred})
>>> pred_mean = model.predict(prediction_data)
Expand Down
Loading