diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py index 9173d6fc..9f43148e 100644 --- a/pymc_marketing/clv/models/basic.py +++ b/pymc_marketing/clv/models/basic.py @@ -293,7 +293,7 @@ def fit_summary(self, **kwargs): def output_var(self): pass - def generate_and_preprocess_model_data( + def _generate_and_preprocess_model_data( self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray[Any, Any]], diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py index 49f5f326..ac734cc8 100644 --- a/pymc_marketing/mmm/base.py +++ b/pymc_marketing/mmm/base.py @@ -42,7 +42,7 @@ def __init__( **kwargs, ) -> None: self.X: Optional[pd.DataFrame] = None - self.y: Optional[pd.Series] = None + self.y: Optional[Union[pd.Series, np.ndarray]] = None self.date_column: str = date_column self.channel_columns: Union[List[str], Tuple[str]] = channel_columns self.n_channel: int = len(channel_columns) @@ -69,8 +69,8 @@ def methods(self) -> List[Any]: def validation_methods( self, ) -> Tuple[ - List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]], - List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]], + List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]], + List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]], ]: """ A property that provides validation methods for features ("X") and the target variable ("y"). @@ -98,7 +98,9 @@ def validation_methods( ], ) - def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None: + def validate( + self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray] + ) -> None: """ Validates the input data based on the specified target type. @@ -110,7 +112,7 @@ def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None: target : str The type of target to be validated. Expected values are "X" for features and "y" for the target variable. - data : Union[pd.DataFrame, pd.Series] + data : Union[pd.DataFrame, pd.Series, np.ndarray] The input data to be validated. Raises @@ -134,14 +136,14 @@ def preprocessing_methods( ) -> Tuple[ List[ Callable[ - ["BaseMMM", Union[pd.DataFrame, pd.Series]], - Union[pd.DataFrame, pd.Series], + ["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], + Union[pd.DataFrame, pd.Series, np.ndarray], ] ], List[ Callable[ - ["BaseMMM", Union[pd.DataFrame, pd.Series]], - Union[pd.DataFrame, pd.Series], + ["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], + Union[pd.DataFrame, pd.Series, np.ndarray], ] ], ]: @@ -171,8 +173,8 @@ def preprocessing_methods( ) def preprocess( - self, target: str, data: Union[pd.DataFrame, pd.Series] - ) -> Union[pd.DataFrame, pd.Series]: + self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray] + ) -> Union[pd.DataFrame, pd.Series, np.ndarray]: """ Preprocess the provided data according to the specified target. @@ -184,12 +186,12 @@ def preprocess( target : str Indicates whether the data represents features ("X") or the target variable ("y"). - data : pd.DataFrame + data : Union[pd.DataFrame, pd.Series, np.ndarray] The data to be preprocessed. Returns ------- - Union[pd.DataFrame, pd.Series] + Union[pd.DataFrame, pd.Series, np.ndarray] The preprocessed data. Raises diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py index db808888..e61a8b0f 100644 --- a/pymc_marketing/mmm/delayed_saturated_mmm.py +++ b/pymc_marketing/mmm/delayed_saturated_mmm.py @@ -81,10 +81,11 @@ def default_sampler_config(self) -> Dict: @property def output_var(self): + """Defines target variable for the model""" return "y" - def generate_and_preprocess_model_data( # type: ignore - self, X: Union[pd.DataFrame, pd.Series], y: pd.Series + def _generate_and_preprocess_model_data( # type: ignore + self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray] ) -> None: """ Applies preprocessing to the data before fitting the model. @@ -93,8 +94,8 @@ def generate_and_preprocess_model_data( # type: ignore Parameters ---------- - X : array, shape (n_obs, n_features) - y : array, shape (n_obs,) + X : Union[pd.DataFrame, pd.Series], shape (n_obs, n_features) + y : Union[pd.Series, np.ndarray], shape (n_obs,) """ date_data = X[self.date_column] channel_data = X[self.channel_columns] @@ -126,11 +127,11 @@ def generate_and_preprocess_model_data( # type: ignore self.validate("X", X_data) self.validate("y", y) self.preprocessed_data: Dict[str, Union[pd.DataFrame, pd.Series]] = { - "X": self.preprocess("X", X_data), - "y": self.preprocess("y", y), + "X": self.preprocess("X", X_data), # type: ignore + "y": self.preprocess("y", y), # type: ignore } self.X: pd.DataFrame = X_data - self.y: pd.Series = y + self.y: Union[pd.Series, np.ndarray] = y def _save_input_params(self, idata) -> None: """Saves input parameters to the attrs of idata.""" @@ -144,11 +145,35 @@ def _save_input_params(self, idata) -> None: def build_model( self, X: pd.DataFrame, - y: pd.Series, + y: Union[pd.Series, np.ndarray], **kwargs, ) -> None: + """ + Builds a probabilistic model using PyMC for marketing mix modeling. + + The model incorporates channels, control variables, and Fourier components, applying + adstock and saturation transformations to the channel data. The final model is + constructed with multiple factors contributing to the response variable. + + Parameters + ---------- + X : pd.DataFrame + The input data for the model, which should include columns for channels, + control variables (if applicable), and Fourier components (if applicable). + + y : Union[pd.Series, np.ndarray] + The target/response variable for the modeling. + + **kwargs : dict + Additional keyword arguments that might be required by underlying methods or utilities. + + Attributes Set + --------------- + model : pm.Model + The PyMC model object containing all the defined stochastic and deterministic variables. + """ model_config = self.model_config - self.generate_and_preprocess_model_data(X, y) + self._generate_and_preprocess_model_data(X, y) with pm.Model(coords=self.model_coords) as self.model: channel_data_ = pm.MutableData( name="channel_data", diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py index 1e3654b0..b846d7fe 100644 --- a/pymc_marketing/model_builder.py +++ b/pymc_marketing/model_builder.py @@ -50,7 +50,7 @@ class ModelBuilder(ABC): version = "None" X: Optional[pd.DataFrame] = None - y: Optional[pd.Series] = None + y: Optional[Union[pd.Series, np.ndarray]] = None def __init__( self, @@ -195,16 +195,17 @@ def default_sampler_config(self) -> Dict: raise NotImplementedError @abstractmethod - def generate_and_preprocess_model_data( - self, - X: Union[pd.DataFrame, pd.Series], - y: Union[pd.Series, np.ndarray[Any, Any]], + def _generate_and_preprocess_model_data( + self, X: Union[pd.DataFrame, pd.Series], y: np.ndarray ) -> None: """ Applies preprocessing to the data before fitting the model. if validate is True, it will check if the data is valid for the model. sets self.model_coords based on provided dataset + In case of optional parameters being passed into the model, this method should implement the conditional + logic responsible for correct handling of the optional parameters, and including them into the dataset. + Parameters: X : array, shape (n_obs, n_features) y : array, shape (n_obs,) @@ -212,11 +213,10 @@ def generate_and_preprocess_model_data( Examples -------- >>> @classmethod - >>> def generate_and_preprocess_model_data(self, X, y): - >>> x = np.linspace(start=1, stop=50, num=100) - >>> y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 + np.random.rand(100)*6.4 - >>> X = pd.DataFrame(x, columns=['x']) - >>> y = pd.Series(y, name='y') + >>> def _generate_and_preprocess_model_data(self, X, y): + coords = { + 'x_dim': X.dim_variable, + } #only include if applicable for your model >>> self.X = X >>> self.y = y @@ -231,7 +231,7 @@ def generate_and_preprocess_model_data( def build_model( self, X: pd.DataFrame, - y: pd.Series, + y: Union[pd.Series, np.ndarray], **kwargs, ) -> None: """ @@ -246,7 +246,7 @@ def build_model( only contain the necessary data columns, not the entire available dataset, as this will be encoded into the data used to recreate the model. - y : pd.Series + y : Union[pd.Series, np.ndarray] The target data for the model. This should be a Series representing the output or dependent variable for the model. @@ -268,49 +268,6 @@ def build_model( """ raise NotImplementedError - def sample_model(self, **kwargs): - """ - Sample from the PyMC model. - - Parameters - ---------- - **kwargs : dict - Additional keyword arguments to pass to the PyMC sampler. - - Returns - ------- - xarray.Dataset - The PyMC samples dataset. - - Raises - ------ - RuntimeError - If the PyMC model hasn't been built yet. - - Examples - -------- - >>> self.build_model() - >>> idata = self.sample_model(draws=100, tune=10) - >>> assert isinstance(idata, xr.Dataset) - >>> assert "posterior" in idata - >>> assert "prior" in idata - >>> assert "observed_data" in idata - >>> assert "log_likelihood" in idata - """ - if self.model is None: - raise RuntimeError( - "The model hasn't been built yet, call .build_model() first or call .fit() instead." - ) - - with self.model: - sampler_args = {**self.sampler_config, **kwargs} - idata = pm.sample(**sampler_args) - idata.extend(pm.sample_prior_predictive()) - idata.extend(pm.sample_posterior_predictive(idata)) - - idata = self.set_idata_attrs(idata) - return idata - def set_idata_attrs(self, idata=None): """ Set attributes on an InferenceData object. @@ -334,11 +291,6 @@ def set_idata_attrs(self, idata=None): >>> model = MyModel(ModelBuilder) >>> idata = az.InferenceData(your_dataset) >>> model.set_idata_attrs(idata=idata) - >>> assert "id" in idata.attrs #this and the following lines are part of doctest, not user manual - >>> assert "model_type" in idata.attrs - >>> assert "version" in idata.attrs - >>> assert "sampler_config" in idata.attrs - >>> assert "model_config" in idata.attrs """ if idata is None: idata = self.idata @@ -381,7 +333,7 @@ def save(self, fname: str) -> None: >>> def __init__(self): >>> super().__init__() >>> model = MyModel() - >>> model.fit(data) + >>> model.fit(X,y) >>> model.save('model_results.nc') # This will call the overridden method in MyModel """ if self.idata is not None and "posterior" in self.idata: @@ -468,7 +420,7 @@ def fit( y: Optional[Union[pd.Series, np.ndarray]] = None, progressbar: bool = True, predictor_names: Optional[List[str]] = None, - random_seed: RandomState = None, + random_seed: Optional[RandomState] = None, **kwargs: Any, ) -> az.InferenceData: """ @@ -484,10 +436,10 @@ def fit( The target values (real numbers). progressbar : bool Specifies whether the fit progressbar should be displayed - predictor_names: List[str] = None, + predictor_names: Optional[List[str]] = None, Allows for custom naming of predictors given in a form of 2dArray allows for naming of predictors when given in a form of np.ndarray, if not provided the predictors will be named like predictor1, predictor2... - random_seed : RandomState + random_seed : Optional[RandomState] Provides sampler with initial random seed for obtaining reproducible samples **kwargs : Any Custom sampler settings can be provided in form of keyword arguments. @@ -499,7 +451,7 @@ def fit( Examples -------- >>> model = MyModel() - >>> idata = model.fit(data) + >>> idata = model.fit(X,y) Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... """ @@ -508,7 +460,7 @@ def fit( if y is None: y = np.zeros(X.shape[0]) y_df = pd.DataFrame({self.output_var: y}) - self.generate_and_preprocess_model_data(X, y_df.values.flatten()) + self._generate_and_preprocess_model_data(X, y_df.values.flatten()) if self.X is None or self.y is None: raise ValueError("X and y must be set before calling build_model!") self.build_model(self.X, self.y) @@ -517,7 +469,12 @@ def fit( sampler_config["progressbar"] = progressbar sampler_config["random_seed"] = random_seed sampler_config.update(**kwargs) - self.idata = self.sample_model(**sampler_config) + + sampler_config.update(**kwargs) + if self.model is not None: + with self.model: + sampler_args = {**self.sampler_config, **kwargs} + self.idata = pm.sample(**sampler_args) X_df = pd.DataFrame(X, columns=X.columns) combined_data = pd.concat([X_df, y_df], axis=1) @@ -529,7 +486,7 @@ def fit( message="The group fit_data is not defined in the InferenceData scheme", ) self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore - + self.set_idata_attrs(self.idata) return self.idata # type: ignore def predict( @@ -558,7 +515,7 @@ def predict( Examples -------- >>> model = MyModel() - >>> idata = model.fit(data) + >>> idata = model.fit(X,y) >>> x_pred = [] >>> prediction_data = pd.DataFrame({'input':x_pred}) >>> pred_mean = model.predict(prediction_data) diff --git a/tests/mmm/test_base.py b/tests/mmm/test_base.py index 55a643b2..5381aeff 100644 --- a/tests/mmm/test_base.py +++ b/tests/mmm/test_base.py @@ -58,7 +58,7 @@ def __init__(self, *args, **kwargs): def build_model(*args, **kwargs): pass - def generate_and_preprocess_model_data(self, X, y): + def _generate_and_preprocess_model_data(self, X, y): self.validate("X", X) self.validate("y", y) self.preprocessed_data["X"] = self.preprocess("X", X) @@ -135,7 +135,7 @@ def test_init( validate_channel_columns.configure_mock(_tags={"validation_X": True}) validate_date_col.configure_mock(_tags={"validation_X": True}) validate_target.configure_mock(_tags={"validation_y": True}) - toy_mmm.generate_and_preprocess_model_data(toy_X, toy_y) + toy_mmm._generate_and_preprocess_model_data(toy_X, toy_y) pd.testing.assert_frame_equal(toy_mmm.X, toy_X) pd.testing.assert_frame_equal(toy_mmm.preprocessed_data["X"], toy_X) pd.testing.assert_series_equal(toy_mmm.y, toy_y) @@ -166,7 +166,7 @@ def build_model(self, toy_X, *args, **kwargs): mu = intercept + slope pm.Normal("y", mu=mu, sigma=sigma) - def generate_and_preprocess_model_data(self, toy_X, toy_y): + def _generate_and_preprocess_model_data(self, toy_X, toy_y): self.validate("X", toy_X) self.validate("y", toy_y) self.preprocessed_data["X"] = self.preprocess("X", toy_X) diff --git a/tests/mmm/test_plotting.py b/tests/mmm/test_plotting.py index 1a88d1c8..35538bcb 100644 --- a/tests/mmm/test_plotting.py +++ b/tests/mmm/test_plotting.py @@ -76,6 +76,8 @@ class ToyMMM(BaseDelayedSaturatedMMM, MaxAbsScaleTarget): X=toy_X, y=toy_y, ) + mmm.sample_prior_predictive(toy_X, toy_y, extend_idata=True, combined=True) + mmm.sample_posterior_predictive(toy_X, extend_idata=True, combined=True) mmm._prior_predictive = mmm.prior_predictive mmm._fit_result = mmm.fit_result mmm._posterior_predictive = mmm.posterior_predictive diff --git a/tests/model_builder/test_model_builder.py b/tests/model_builder/test_model_builder.py index 34b08c90..f29070be 100644 --- a/tests/model_builder/test_model_builder.py +++ b/tests/model_builder/test_model_builder.py @@ -59,7 +59,28 @@ def fitted_model_instance(toy_X, toy_y): sampler_config=sampler_config, test_parameter="test_paramter", ) - model.fit(toy_X) + model.fit( + toy_X, + chains=1, + draws=100, + tune=100, + ) + return model + + +@pytest.fixture(scope="module") +def not_fitted_model_instance(toy_X, toy_y): + sampler_config = {"draws": 100, "tune": 100, "chains": 2, "target_accept": 0.95} + model_config = { + "a": {"loc": 0, "scale": 10, "dims": ("numbers",)}, + "b": {"loc": 0, "scale": 10}, + "obs_error": 2, + } + model = test_ModelBuilder( + model_config=model_config, + sampler_config=sampler_config, + test_parameter="test_paramter", + ) return model @@ -73,7 +94,7 @@ def __init__(self, model_config=None, sampler_config=None, test_parameter=None): def build_model(self, X: pd.DataFrame, y: pd.Series, model_config=None): coords = {"numbers": np.arange(len(X))} - self.generate_and_preprocess_model_data(X, y) + self._generate_and_preprocess_model_data(X, y) with pm.Model(coords=coords) as self.model: if model_config is None: model_config = self.default_model_config @@ -112,7 +133,7 @@ def _data_setter(self, x: pd.Series, y: pd.Series = None): def _serializable_model_config(self): return self.model_config - def generate_and_preprocess_model_data(self, X: pd.DataFrame, y: pd.Series): + def _generate_and_preprocess_model_data(self, X: pd.DataFrame, y: pd.Series): self.X = X self.y = y @@ -167,12 +188,18 @@ def test_save_without_fit_raises_runtime_error(): def test_empty_sampler_config_fit(toy_X, toy_y): sampler_config = {} model_builder = test_ModelBuilder(sampler_config=sampler_config) - model_builder.idata = model_builder.fit(X=toy_X, y=toy_y) + model_builder.idata = model_builder.fit( + X=toy_X, y=toy_y, chains=1, draws=100, tune=100 + ) assert model_builder.idata is not None assert "posterior" in model_builder.idata.groups() def test_fit(fitted_model_instance): + assert fitted_model_instance.idata is not None + assert "posterior" in fitted_model_instance.idata.groups() + assert fitted_model_instance.idata.posterior.dims["draw"] == 100 + prediction_data = pd.DataFrame( {"input": np.random.uniform(low=0, high=1, size=100)} ) @@ -185,7 +212,7 @@ def test_fit(fitted_model_instance): def test_fit_no_y(toy_X): model_builder = test_ModelBuilder() - model_builder.idata = model_builder.fit(X=toy_X) + model_builder.idata = model_builder.fit(X=toy_X, chains=1, draws=100, tune=100) assert model_builder.model is not None assert model_builder.idata is not None assert "posterior" in model_builder.idata.groups()