Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add datasetter for date and fourier #405

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 39 additions & 16 deletions pymc_marketing/mmm/delayed_saturated_mmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pymc_marketing.mmm.base import MMM
from pymc_marketing.mmm.preprocessing import MaxAbsScaleChannels, MaxAbsScaleTarget
from pymc_marketing.mmm.transformers import geometric_adstock, logistic_saturation
from pymc_marketing.mmm.utils import generate_fourier_modes
from pymc_marketing.mmm.utils import generate_yearly_fourier_modes
from pymc_marketing.mmm.validating import ValidateControlColumns

__all__ = ["DelayedSaturatedMMM"]
Expand Down Expand Up @@ -85,7 +85,7 @@ def output_var(self):
return "y"

def _generate_and_preprocess_model_data( # type: ignore
self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray]
self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray]
) -> None:
"""
Applies preprocessing to the data before fitting the model.
Expand All @@ -94,7 +94,7 @@ def _generate_and_preprocess_model_data( # type: ignore

Parameters
----------
X : Union[pd.DataFrame, pd.Series], shape (n_obs, n_features)
X : pd.DataFrame, shape (n_obs, n_features)
y : Union[pd.Series, np.ndarray], shape (n_obs,)
"""
date_data = X[self.date_column]
Expand Down Expand Up @@ -326,7 +326,7 @@ def default_model_config(self) -> Dict:
}
return model_config

def _get_fourier_models_data(self, X) -> pd.DataFrame:
def _get_fourier_models_data(self, X: pd.DataFrame) -> pd.DataFrame:
"""Generates fourier modes to model seasonality.

References
Expand All @@ -338,10 +338,9 @@ def _get_fourier_models_data(self, X) -> pd.DataFrame:
date_data: pd.Series = pd.to_datetime(
arg=X[self.date_column], format="%Y-%m-%d"
)
periods: npt.NDArray[np.float_] = date_data.dt.dayofyear.to_numpy() / 365.25
return generate_fourier_modes(
periods=periods,
n_order=self.yearly_seasonality,

return generate_yearly_fourier_modes(
dayofyear=date_data.dt.dayofyear.to_numpy(), n_order=self.yearly_seasonality
)

def channel_contributions_forward_pass(
Expand Down Expand Up @@ -486,18 +485,42 @@ def _data_setter(
"""
new_channel_data: Optional[np.ndarray] = None

if isinstance(X, pd.DataFrame):
def from_frame_or_array(
X: Union[pd.DataFrame, np.ndarray], columns, handle_frame_func=None
) -> np.ndarray:
if not isinstance(X, (pd.DataFrame, np.ndarray)):
raise TypeError("X must be either a pandas DataFrame or a numpy array")

if isinstance(X, np.ndarray):
return X
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should support ndarray when we want to select different sets of columns, unless we want to support the case where only costs are added, i.e.:
isinstance(X, np.ndarray) and self.control_columns is None and self.yearly_seasonality is None

I think that this can simplify the code heavily


if handle_frame_func is None:

def handle_frame_func(X):
raise RuntimeError(f"New data must contain {columns}!")

try:
new_channel_data = X[self.channel_columns].to_numpy()
except KeyError as e:
raise RuntimeError("New data must contain channel_data!", e)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataFrames already raise KeyError; can that be used instead? What's more important: switching to RuntimeError or having an informative error message?

elif isinstance(X, np.ndarray):
new_channel_data = X
else:
raise TypeError("X must be either a pandas DataFrame or a numpy array")
return X[columns].to_numpy()
except KeyError:
return handle_frame_func(X)

new_channel_data = from_frame_or_array(X, columns=self.channel_columns)
data: Dict[str, Union[np.ndarray, Any]] = {"channel_data": new_channel_data}

if self.control_columns is not None:
new_control_data = from_frame_or_array(X, columns=self.control_columns)
data["control_data"] = new_control_data

if self.yearly_seasonality is not None:

def handle_frame_func(X):
return self._get_fourier_models_data(X).to_numpy()

new_fourier_data = from_frame_or_array(
X, columns=self.fourier_columns, handle_frame_func=handle_frame_func
)
data["fourier_data"] = new_fourier_data

if y is not None:
if isinstance(y, pd.Series):
data[
Expand Down
2 changes: 1 addition & 1 deletion pymc_marketing/mmm/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class MaxAbsScaleTarget:

@preprocessing_method_y
def max_abs_scale_target_data(self, data: pd.Series) -> pd.Series:
target_vector = data.reshape(-1, 1)
wd60622 marked this conversation as resolved.
Show resolved Hide resolved
target_vector = data.to_numpy().reshape(-1, 1)
transformers = [("scaler", MaxAbsScaler())]
pipeline = Pipeline(steps=transformers)
self.target_transformer: Pipeline = pipeline.fit(X=target_vector)
Expand Down
2 changes: 1 addition & 1 deletion pymc_marketing/mmm/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def batched_convolution(x, w, axis: int = 0):


def geometric_adstock(
x, alpha: float = 0.0, l_max: int = 12, normalize: bool = False, axis: int = 0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getting a mypy issue

x, alpha=0.0, l_max: int = 12, normalize: bool = False, axis: int = 0
):
"""Geometric adstock transformation.

Expand Down
26 changes: 26 additions & 0 deletions pymc_marketing/mmm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,32 @@
from scipy.optimize import curve_fit, minimize_scalar


def generate_yearly_fourier_modes(
    dayofyear: npt.NDArray[np.float_], n_order: int
) -> pd.DataFrame:
    """Build the Fourier features used to model yearly seasonality.

    Each day-of-year value is expressed as a fraction of a 365.25-day year,
    and the resulting periods are handed to ``generate_fourier_modes``.

    Parameters
    ----------
    dayofyear : npt.NDArray[np.float_]
        Input array denoting the day of year.
    n_order : int
        Maximum order of Fourier modes; passed through unchanged.

    Returns
    -------
    pd.DataFrame
        The dataframe produced by ``generate_fourier_modes`` for the scaled
        periods (sin and cos modes with different frequencies as columns).

    """
    # 365.25 rather than 365 — presumably averages in leap years; confirm.
    days_per_year = 365.25
    year_fraction: npt.NDArray[np.float_] = dayofyear / days_per_year
    return generate_fourier_modes(periods=year_fraction, n_order=n_order)


def generate_fourier_modes(
periods: npt.NDArray[np.float_], n_order: int
) -> pd.DataFrame:
Expand Down
Loading