pymc-labs · michaelraczycki · Sep 1, 2023 · Aug 19, 2023 · Aug 19, 2023 · Aug 20, 2023
diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py
@@ -293,7 +293,7 @@ def fit_summary(self, **kwargs):
     def output_var(self):
         pass
 
-    def generate_and_preprocess_model_data(
+    def _generate_and_preprocess_model_data(
         self,
         X: Union[pd.DataFrame, pd.Series],
         y: Union[pd.Series, np.ndarray[Any, Any]],

diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py
@@ -42,7 +42,7 @@ def __init__(
         **kwargs,
     ) -> None:
         self.X: Optional[pd.DataFrame] = None
-        self.y: Optional[pd.Series] = None
+        self.y: Optional[Union[pd.Series, np.ndarray]] = None
         self.date_column: str = date_column
         self.channel_columns: Union[List[str], Tuple[str]] = channel_columns
         self.n_channel: int = len(channel_columns)
@@ -69,8 +69,8 @@ def methods(self) -> List[Any]:
     def validation_methods(
         self,
     ) -> Tuple[
-        List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]],
-        List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series]], None]],
+        List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]],
+        List[Callable[["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]], None]],
     ]:
         """
         A property that provides validation methods for features ("X") and the target variable ("y").
@@ -98,7 +98,9 @@ def validation_methods(
             ],
         )
 
-    def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None:
+    def validate(
+        self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray]
+    ) -> None:
         """
         Validates the input data based on the specified target type.
 
@@ -110,7 +112,7 @@ def validate(self, target: str, data: Union[pd.DataFrame, pd.Series]) -> None:
         target : str
             The type of target to be validated.
             Expected values are "X" for features and "y" for the target variable.
-        data : Union[pd.DataFrame, pd.Series]
+        data : Union[pd.DataFrame, pd.Series, np.ndarray]
             The input data to be validated.
 
         Raises
@@ -134,14 +136,14 @@ def preprocessing_methods(
     ) -> Tuple[
         List[
             Callable[
-                ["BaseMMM", Union[pd.DataFrame, pd.Series]],
-                Union[pd.DataFrame, pd.Series],
+                ["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]],
+                Union[pd.DataFrame, pd.Series, np.ndarray],
             ]
         ],
         List[
             Callable[
-                ["BaseMMM", Union[pd.DataFrame, pd.Series]],
-                Union[pd.DataFrame, pd.Series],
+                ["BaseMMM", Union[pd.DataFrame, pd.Series, np.ndarray]],
+                Union[pd.DataFrame, pd.Series, np.ndarray],
             ]
         ],
     ]:
@@ -171,8 +173,8 @@ def preprocessing_methods(
         )
 
     def preprocess(
-        self, target: str, data: Union[pd.DataFrame, pd.Series]
-    ) -> Union[pd.DataFrame, pd.Series]:
+        self, target: str, data: Union[pd.DataFrame, pd.Series, np.ndarray]
+    ) -> Union[pd.DataFrame, pd.Series, np.ndarray]:
         """
         Preprocess the provided data according to the specified target.
 
@@ -184,12 +186,12 @@ def preprocess(
         target : str
             Indicates whether the data represents features ("X") or the target variable ("y").
 
-        data : pd.DataFrame
+        data : Union[pd.DataFrame, pd.Series, np.ndarray]
             The data to be preprocessed.
 
         Returns
         -------
-        Union[pd.DataFrame, pd.Series]
+        Union[pd.DataFrame, pd.Series, np.ndarray]
             The preprocessed data.
 
         Raises

diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py
@@ -81,10 +81,11 @@ def default_sampler_config(self) -> Dict:
 
     @property
     def output_var(self):
+        """Defines the target variable for the model."""
         return "y"
 
-    def generate_and_preprocess_model_data(  # type: ignore
-        self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
+    def _generate_and_preprocess_model_data(  # type: ignore
+        self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray]
     ) -> None:
         """
         Applies preprocessing to the data before fitting the model.
@@ -93,8 +94,8 @@ def generate_and_preprocess_model_data(  # type: ignore
 
         Parameters
         ----------
-        X : array, shape (n_obs, n_features)
-        y : array, shape (n_obs,)
+        X : Union[pd.DataFrame, pd.Series], shape (n_obs, n_features)
+        y : Union[pd.Series, np.ndarray], shape (n_obs,)
         """
         date_data = X[self.date_column]
         channel_data = X[self.channel_columns]
@@ -126,11 +127,11 @@ def generate_and_preprocess_model_data(  # type: ignore
             self.validate("X", X_data)
             self.validate("y", y)
         self.preprocessed_data: Dict[str, Union[pd.DataFrame, pd.Series]] = {
-            "X": self.preprocess("X", X_data),
-            "y": self.preprocess("y", y),
+            "X": self.preprocess("X", X_data),  # type: ignore
+            "y": self.preprocess("y", y),  # type: ignore
         }
         self.X: pd.DataFrame = X_data
-        self.y: pd.Series = y
+        self.y: Union[pd.Series, np.ndarray] = y
 
     def _save_input_params(self, idata) -> None:
         """Saves input parameters to the attrs of idata."""
@@ -144,11 +145,35 @@ def _save_input_params(self, idata) -> None:
     def build_model(
         self,
         X: pd.DataFrame,
-        y: pd.Series,
+        y: Union[pd.Series, np.ndarray],
         **kwargs,
     ) -> None:
+        """
+        Builds a probabilistic model using PyMC for marketing mix modeling.
+
+        The model incorporates channels, control variables, and Fourier components, applying
+        adstock and saturation transformations to the channel data. The final model is
+        constructed with multiple factors contributing to the response variable.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            The input data for the model, which should include columns for channels,
+            control variables (if applicable), and Fourier components (if applicable).
+
+        y : Union[pd.Series, np.ndarray]
+            The target/response variable for the modeling.
+
+        **kwargs : dict
+            Additional keyword arguments that might be required by underlying methods or utilities.
+
+        Attributes Set
+        ---------------
+        model : pm.Model
+            The PyMC model object containing all the defined stochastic and deterministic variables.
+        """
         model_config = self.model_config
-        self.generate_and_preprocess_model_data(X, y)
+        self._generate_and_preprocess_model_data(X, y)
         with pm.Model(coords=self.model_coords) as self.model:
             channel_data_ = pm.MutableData(
                 name="channel_data",

diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py
@@ -50,7 +50,7 @@ class ModelBuilder(ABC):
     version = "None"
 
     X: Optional[pd.DataFrame] = None
-    y: Optional[pd.Series] = None
+    y: Optional[Union[pd.Series, np.ndarray]] = None
 
     def __init__(
         self,
@@ -195,28 +195,28 @@ def default_sampler_config(self) -> Dict:
         raise NotImplementedError
 
     @abstractmethod
-    def generate_and_preprocess_model_data(
-        self,
-        X: Union[pd.DataFrame, pd.Series],
-        y: Union[pd.Series, np.ndarray[Any, Any]],
+    def _generate_and_preprocess_model_data(
+        self, X: Union[pd.DataFrame, pd.Series], y: np.ndarray
     ) -> None:
         """
         Applies preprocessing to the data before fitting the model.
         if validate is True, it will check if the data is valid for the model.
         sets self.model_coords based on provided dataset
 
+        In case of optional parameters being passed into the model, this method should implement the conditional
+        logic responsible for correct handling of the optional parameters, and including them into the dataset.
+
         Parameters:
         X : array, shape (n_obs, n_features)
         y : array, shape (n_obs,)
 
         Examples
         --------
         >>>     @classmethod
-        >>>     def generate_and_preprocess_model_data(self, X, y):
-        >>>         x = np.linspace(start=1, stop=50, num=100)
-        >>>         y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 +  np.random.rand(100)*6.4
-        >>>         X = pd.DataFrame(x, columns=['x'])
-        >>>         y = pd.Series(y, name='y')
+        >>>     def _generate_and_preprocess_model_data(self, X, y):
+        >>>         coords = {
+        >>>             'x_dim': X.dim_variable,
+        >>>         }  # only include if applicable for your model
         >>>         self.X = X
         >>>         self.y = y
 
@@ -231,7 +231,7 @@ def generate_and_preprocess_model_data(
     def build_model(
         self,
         X: pd.DataFrame,
-        y: pd.Series,
+        y: Union[pd.Series, np.ndarray],
         **kwargs,
     ) -> None:
         """
@@ -246,7 +246,7 @@ def build_model(
             only contain the necessary data columns, not the entire available dataset, as this
             will be encoded into the data used to recreate the model.
 
-        y : pd.Series
+        y : Union[pd.Series, np.ndarray]
             The target data for the model. This should be a Series representing the output
             or dependent variable for the model.
 
@@ -268,49 +268,6 @@ def build_model(
         """
         raise NotImplementedError
 
-    def sample_model(self, **kwargs):
-        """
-        Sample from the PyMC model.
-
-        Parameters
-        ----------
-        **kwargs : dict
-            Additional keyword arguments to pass to the PyMC sampler.
-
-        Returns
-        -------
-        xarray.Dataset
-            The PyMC samples dataset.
-
-        Raises
-        ------
-        RuntimeError
-            If the PyMC model hasn't been built yet.
-
-        Examples
-        --------
-        >>> self.build_model()
-        >>> idata = self.sample_model(draws=100, tune=10)
-        >>> assert isinstance(idata, xr.Dataset)
-        >>> assert "posterior" in idata
-        >>> assert "prior" in idata
-        >>> assert "observed_data" in idata
-        >>> assert "log_likelihood" in idata
-        """
-        if self.model is None:
-            raise RuntimeError(
-                "The model hasn't been built yet, call .build_model() first or call .fit() instead."
-            )
-
-        with self.model:
-            sampler_args = {**self.sampler_config, **kwargs}
-            idata = pm.sample(**sampler_args)
-            idata.extend(pm.sample_prior_predictive())
-            idata.extend(pm.sample_posterior_predictive(idata))
-
-        idata = self.set_idata_attrs(idata)
-        return idata
-
     def set_idata_attrs(self, idata=None):
         """
         Set attributes on an InferenceData object.
@@ -334,11 +291,6 @@ def set_idata_attrs(self, idata=None):
         >>> model = MyModel(ModelBuilder)
         >>> idata = az.InferenceData(your_dataset)
         >>> model.set_idata_attrs(idata=idata)
-        >>> assert "id" in idata.attrs #this and the following lines are part of doctest, not user manual
-        >>> assert "model_type" in idata.attrs
-        >>> assert "version" in idata.attrs
-        >>> assert "sampler_config" in idata.attrs
-        >>> assert "model_config" in idata.attrs
         """
         if idata is None:
             idata = self.idata
@@ -381,7 +333,7 @@ def save(self, fname: str) -> None:
         >>>     def __init__(self):
         >>>         super().__init__()
         >>> model = MyModel()
-        >>> model.fit(data)
+        >>> model.fit(X, y)
         >>> model.save('model_results.nc')  # This will call the overridden method in MyModel
         """
         if self.idata is not None and "posterior" in self.idata:
@@ -468,7 +420,7 @@ def fit(
         y: Optional[Union[pd.Series, np.ndarray]] = None,
         progressbar: bool = True,
         predictor_names: Optional[List[str]] = None,
-        random_seed: RandomState = None,
+        random_seed: Optional[RandomState] = None,
         **kwargs: Any,
     ) -> az.InferenceData:
         """
@@ -484,10 +436,10 @@ def fit(
             The target values (real numbers).
         progressbar : bool
             Specifies whether the fit progressbar should be displayed
-        predictor_names: List[str] = None,
+        predictor_names : Optional[List[str]]
             Allows for custom naming of predictors given in a form of 2dArray
             allows for naming of predictors when given in a form of np.ndarray, if not provided the predictors will be named like predictor1, predictor2...
-        random_seed : RandomState
+        random_seed : Optional[RandomState]
             Provides sampler with initial random seed for obtaining reproducible samples
         **kwargs : Any
             Custom sampler settings can be provided in form of keyword arguments.
@@ -499,7 +451,7 @@ def fit(
         Examples
         --------
         >>> model = MyModel()
-        >>> idata = model.fit(data)
+        >>> idata = model.fit(X, y)
         Auto-assigning NUTS sampler...
         Initializing NUTS using jitter+adapt_diag...
         """
@@ -508,7 +460,7 @@ def fit(
         if y is None:
             y = np.zeros(X.shape[0])
         y_df = pd.DataFrame({self.output_var: y})
-        self.generate_and_preprocess_model_data(X, y_df.values.flatten())
+        self._generate_and_preprocess_model_data(X, y_df.values.flatten())
         if self.X is None or self.y is None:
             raise ValueError("X and y must be set before calling build_model!")
         self.build_model(self.X, self.y)
@@ -517,7 +469,12 @@ def fit(
         sampler_config["progressbar"] = progressbar
         sampler_config["random_seed"] = random_seed
         sampler_config.update(**kwargs)
-        self.idata = self.sample_model(**sampler_config)
+
+        # Use the call-time config so progressbar/random_seed/kwargs take effect.
+        if self.model is not None:
+            with self.model:
+                sampler_args = {**self.sampler_config, **sampler_config}
+                self.idata = pm.sample(**sampler_args)
 
         X_df = pd.DataFrame(X, columns=X.columns)
         combined_data = pd.concat([X_df, y_df], axis=1)
@@ -529,7 +486,7 @@ def fit(
                 message="The group fit_data is not defined in the InferenceData scheme",
             )
             self.idata.add_groups(fit_data=combined_data.to_xarray())  # type: ignore
-
+        self.set_idata_attrs(self.idata)
         return self.idata  # type: ignore
 
     def predict(
@@ -558,7 +515,7 @@ def predict(
         Examples
         --------
         >>> model = MyModel()
-        >>> idata = model.fit(data)
+        >>> idata = model.fit(X, y)
         >>> x_pred = []
         >>> prediction_data = pd.DataFrame({'input':x_pred})
         >>> pred_mean = model.predict(prediction_data)