From c02d349653a3422053e05ab39dfeead6900af4e0 Mon Sep 17 00:00:00 2001 From: Juan Orduz Date: Thu, 18 Apr 2024 20:04:02 +0200 Subject: [PATCH] Add Flake8-BugBear and Bandit (Security!) (#637) * fix potential bugs * minor improvements * remove rule for nb * fix test * improve tests syntax * use stacklevel 2 for warnings * use stacklevel 1 for warnings as they are used in public methods * ignore B904 * undo change * ricardos feedback * use fit_posterior * Update pymc_marketing/clv/models/gamma_gamma.py Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com> * last fix XD --------- Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com> --- .../source/notebooks/clv/clv_quickstart.ipynb | 2 +- .../general/other_nuts_samplers.ipynb | 2 +- pymc_marketing/clv/models/basic.py | 8 ++-- pymc_marketing/clv/models/gamma_gamma.py | 17 +++++---- pymc_marketing/clv/utils.py | 2 +- pymc_marketing/mmm/base.py | 14 +++++-- pymc_marketing/mmm/budget_optimizer.py | 6 ++- pymc_marketing/mmm/delayed_saturated_mmm.py | 2 +- pymc_marketing/mmm/transformers.py | 2 +- pymc_marketing/model_builder.py | 3 +- pyproject.toml | 11 +++++- tests/clv/test_utils.py | 4 +- tests/mmm/test_lift_test.py | 2 +- tests/model_builder/test_model_builder.py | 37 ++++++++++--------- 14 files changed, 67 insertions(+), 45 deletions(-) diff --git a/docs/source/notebooks/clv/clv_quickstart.ipynb b/docs/source/notebooks/clv/clv_quickstart.ipynb index 386d7f32..2061d48c 100644 --- a/docs/source/notebooks/clv/clv_quickstart.ipynb +++ b/docs/source/notebooks/clv/clv_quickstart.ipynb @@ -968,7 +968,7 @@ "source": [ "ids = [841, 1981, 157, 1516]\n", "ax = az.plot_posterior(num_purchases.sel(customer_id=ids), grid=(2, 2))\n", - "for axi, id in zip(ax.ravel(), ids):\n", + "for axi, id in zip(ax.ravel(), ids, strict=False):\n", " axi.set_title(f\"Customer: {id}\", size=20)\n", "plt.suptitle(\"Expected number purchases in the next period\", fontsize=28, y=1.05);" ] diff --git a/docs/source/notebooks/general/other_nuts_samplers.ipynb b/docs/source/notebooks/general/other_nuts_samplers.ipynb index 2c181e9e..72590ed2 100644 --- a/docs/source/notebooks/general/other_nuts_samplers.ipynb +++ b/docs/source/notebooks/general/other_nuts_samplers.ipynb @@ -170,7 +170,7 @@ " for j, (idata, label) in enumerate(\n", " zip(\n", " [idata_blackjax, idata_nutpie, idata_numpyro],\n", - " [\"blackjax\", \"nutpie\", \"numpyro\"],\n", + " [\"blackjax\", \"nutpie\", \"numpyro\"], strict=False,\n", " )\n", " ):\n", " az.plot_posterior(\n", diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py index 997e96ff..f8a58972 100644 --- a/pymc_marketing/clv/models/basic.py +++ b/pymc_marketing/clv/models/basic.py @@ -57,7 +57,7 @@ def _add_fit_data_group(self, data: pd.DataFrame) -> None: category=UserWarning, message="The group fit_data is not defined in the InferenceData scheme", ) - assert self.idata is not None + assert self.idata is not None # noqa: S101 self.idata.add_groups(fit_data=data.to_xarray()) def fit( # type: ignore @@ -206,8 +206,8 @@ def thin_fit_result(self, keep_every: int): ) """ - self.fit_result # Raise Error if fit didn't happen yet - assert self.idata is not None + self.fit_result # noqa: B018 (Raise Error if fit didn't happen yet) + assert self.idata is not None # noqa: S101 new_idata = self.idata.isel(draw=slice(None, None, keep_every)).copy() return type(self)._build_with_idata(new_idata) @@ -237,7 +237,7 @@ def fit_result(self, res: az.InferenceData) -> None: if self.idata is None: self.idata = res elif "posterior" in self.idata: - warnings.warn("Overriding pre-existing fit_result") + warnings.warn("Overriding pre-existing fit_result", stacklevel=1) self.idata.posterior = res else: self.idata.posterior = res diff --git a/pymc_marketing/clv/models/gamma_gamma.py b/pymc_marketing/clv/models/gamma_gamma.py index 4247e7d0..8370d894 100644 --- a/pymc_marketing/clv/models/gamma_gamma.py +++ b/pymc_marketing/clv/models/gamma_gamma.py @@ -56,10 +56,11 @@ def expected_customer_spend( mean_transaction_value, frequency = to_xarray( customer_id, mean_transaction_value, frequency ) - assert self.idata is not None, "Model must be fitted first" - p = self.idata.posterior["p"] - q = self.idata.posterior["q"] - v = self.idata.posterior["v"] + posterior = self.fit_result + + p = posterior["p"] + q = posterior["q"] + v = posterior["v"] individual_weight = p * frequency / (p * frequency + q - 1) population_mean = v * p / (q - 1) @@ -89,10 +90,10 @@ def distribution_new_customer_spend( def expected_new_customer_spend(self) -> xarray.DataArray: """Expected transaction value for a new customer""" - assert self.idata is not None, "Model must be fitted first" - p_mean = self.idata.posterior["p"] - q_mean = self.idata.posterior["q"] - v_mean = self.idata.posterior["v"] + posterior = self.fit_result + p_mean = posterior["p"] + q_mean = posterior["q"] + v_mean = posterior["v"] # Closed form solution to the posterior of nu # Eq 3 from [1], p.3 diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py index 3e237675..4e5360a7 100644 --- a/pymc_marketing/clv/utils.py +++ b/pymc_marketing/clv/utils.py @@ -270,7 +270,7 @@ def _find_first_transactions( def clv_summary(*args, **kwargs): - warnings.warn("clv_summary was renamed to rfm_summary", UserWarning) + warnings.warn("clv_summary was renamed to rfm_summary", UserWarning, stacklevel=1) return rfm_summary(*args, **kwargs) diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py index 065d32ce..0236ae73 100644 --- a/pymc_marketing/mmm/base.py +++ b/pymc_marketing/mmm/base.py @@ -392,6 +392,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure: for arg, var_contribution in zip( ["control_columns", "yearly_seasonality"], ["control_contributions", "fourier_contributions"], + strict=True, ): if getattr(self, arg, None): contributions = self._format_model_contributions( @@ -413,6 +414,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure: "control_contribution", "fourier_contribution", ], + strict=False, ) ): if self.X is not None: @@ -688,7 +690,7 @@ def plot_budget_scenearios( # Plot all scenarios for i, (scenario, upper_bound, lower_bound) in enumerate( - zip(scenarios, upper_bounds, lower_bounds) + zip(scenarios, upper_bounds, lower_bounds, strict=False) ): color = f"C{i}" offset = i * bar_width - 0.4 + bar_width / 2 @@ -914,7 +916,9 @@ def optimize_channel_budget_for_maximum_contribution( "The 'parameters' argument (keyword-only) must be provided and non-empty." ) - warnings.warn("This budget allocator method is experimental", UserWarning) + warnings.warn( + "This budget allocator method is experimental", UserWarning, stacklevel=1 + ) return budget_allocator( method=method, @@ -947,7 +951,9 @@ def compute_channel_curve_optimization_parameters_original_scale( parameters for each channel based on the method used. """ warnings.warn( - "The curve optimization parameters method is experimental", UserWarning + "The curve optimization parameters method is experimental", + UserWarning, + stacklevel=1, ) channel_contributions = self.compute_channel_contribution_original_scale().mean( @@ -1044,7 +1050,7 @@ def legend_title_func(channel): axes_channels = ( zip(repeat(axes), channels_to_plot) if same_axes - else zip(np.ravel(axes), channels_to_plot) + else zip(np.ravel(axes), channels_to_plot, strict=False) ) for i, (ax, channel) in enumerate(axes_channels): diff --git a/pymc_marketing/mmm/budget_optimizer.py b/pymc_marketing/mmm/budget_optimizer.py index 69418283..a616d2bb 100644 --- a/pymc_marketing/mmm/budget_optimizer.py +++ b/pymc_marketing/mmm/budget_optimizer.py @@ -101,7 +101,7 @@ def objective_distribution( sum_contributions = 0.0 - for channel, budget in zip(channels, x): + for channel, budget in zip(channels, x, strict=False): if method == "michaelis-menten": L, k = parameters[channel] sum_contributions += michaelis_menten(budget, L, k) @@ -182,7 +182,9 @@ def optimize_budget_distribution( constraints=constraints, ) - return {channel: budget for channel, budget in zip(channels, result.x)} + return { + channel: budget for channel, budget in zip(channels, result.x, strict=False) + } def budget_allocator( diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py index 3ef533ee..9e8bf177 100644 --- a/pymc_marketing/mmm/delayed_saturated_mmm.py +++ b/pymc_marketing/mmm/delayed_saturated_mmm.py @@ -652,7 +652,7 @@ def _data_setter( try: new_channel_data = X[self.channel_columns].to_numpy() except KeyError as e: - raise RuntimeError("New data must contain channel_data!", e) + raise RuntimeError("New data must contain channel_data!") from e def identity(x): return x diff --git a/pymc_marketing/mmm/transformers.py b/pymc_marketing/mmm/transformers.py index a7f54410..08874884 100644 --- a/pymc_marketing/mmm/transformers.py +++ b/pymc_marketing/mmm/transformers.py @@ -92,7 +92,7 @@ def batched_convolution( if l_max is None: try: l_max = w.shape[-1].eval() - except Exception: + except Exception: # noqa: S110 pass # Get the broadcast shapes of x and w but ignoring their last dimension. # The last dimension of x is the "time" axis, which doesn't get broadcast diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py index a8240ded..2ad74d81 100644 --- a/pymc_marketing/model_builder.py +++ b/pymc_marketing/model_builder.py @@ -486,7 +486,8 @@ def fit( X_df = pd.DataFrame(X, columns=X.columns) combined_data = pd.concat([X_df, y_df], axis=1) - assert all(combined_data.columns), "All columns must have non-empty names" + if not all(combined_data.columns): + raise ValueError("All columns must have non-empty names") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", diff --git a/pyproject.toml b/pyproject.toml index a88c459f..141d9940 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,12 +59,21 @@ repository = "https://github.com/pymc-labs/pymc-marketing" #changelog = "" [tool.ruff.lint] -select = ["E", "F", "I", "W", "UP", "RUF"] +select = ["B", "E", "F", "I", "RUF", "S", "UP", "W"] ignore = [ + "B904", # raise-without-from-inside-except "RUF001", # String contains ambiguous character (such as Greek letters) "RUF002", # Docstring contains ambiguous character (such as Greek letters) "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` ] +[tool.ruff.lint.per-file-ignores] +"docs/source/notebooks/*" = [ + "B018", # Checks for "useless" expressions. Not useful for notebooks. +] +"tests/*" = [ + "B018", # Checks for "useless" expressions. This is useful for tests. + "S101", # Use of assert +] [tool.ruff.lint.pycodestyle] max-line-length = 120 diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py index 798f8f6c..83bcf815 100644 --- a/tests/clv/test_utils.py +++ b/tests/clv/test_utils.py @@ -31,14 +31,14 @@ def test_to_xarray(): np.testing.assert_array_equal(new_x.coords["customer_id"], customer_id) np.testing.assert_array_equal(x, new_x.values) - for old, new in zip((x, y, z), to_xarray(customer_id, x, y, z)): + for old, new in zip((x, y, z), to_xarray(customer_id, x, y, z), strict=False): assert isinstance(new, xarray.DataArray) assert new.dims == ("customer_id",) np.testing.assert_array_equal(new.coords["customer_id"], customer_id) np.testing.assert_array_equal(old, new.values) new_y = to_xarray(customer_id, y, dim="test_dim") - new_y.dims == ("test_dim",) + assert new_y.dims == ("test_dim",) np.testing.assert_array_equal(new_y.coords["test_dim"], customer_id) diff --git a/tests/mmm/test_lift_test.py b/tests/mmm/test_lift_test.py index e4d511e7..5bc6ea26 100644 --- a/tests/mmm/test_lift_test.py +++ b/tests/mmm/test_lift_test.py @@ -98,7 +98,7 @@ def test_lift_tests_missing(df_lift_tests) -> None: with pytest.raises(MissingLiftTestError) as err: lift_test_indices(df_lift_tests, model) - err.value.missing_values.tolist() == [0] + assert err.value.missing_values.tolist() == [0] @pytest.fixture diff --git a/tests/model_builder/test_model_builder.py b/tests/model_builder/test_model_builder.py index bda63b95..543ee242 100644 --- a/tests/model_builder/test_model_builder.py +++ b/tests/model_builder/test_model_builder.py @@ -25,26 +25,24 @@ from pymc_marketing.model_builder import ModelBuilder -rng = np.random.default_rng(42) - @pytest.fixture(scope="module") def toy_X(): x = np.linspace(start=0, stop=1, num=100) - X = pd.DataFrame({"input": x}) - return X + return pd.DataFrame({"input": x}) @pytest.fixture(scope="module") def toy_y(toy_X): + rng = np.random.default_rng(42) y = 5 * toy_X["input"] + 3 - y = y + np.random.normal(0, 1, size=len(toy_X)) + y = y + rng.normal(0, 1, size=len(toy_X)) y = pd.Series(y, name="output") return y @pytest.fixture(scope="module") -def fitted_model_instance(toy_X, toy_y): +def fitted_model_instance(toy_X): sampler_config = { "draws": 100, "tune": 100, @@ -71,19 +69,18 @@ def fitted_model_instance(toy_X, toy_y): @pytest.fixture(scope="module") -def not_fitted_model_instance(toy_X, toy_y): +def not_fitted_model_instance(): sampler_config = {"draws": 100, "tune": 100, "chains": 2, "target_accept": 0.95} model_config = { "a": {"loc": 0, "scale": 10, "dims": ("numbers",)}, "b": {"loc": 0, "scale": 10}, "obs_error": 2, } - model = ModelBuilderTest( + return ModelBuilderTest( model_config=model_config, sampler_config=sampler_config, test_parameter="test_paramter", ) - return model class ModelBuilderTest(ModelBuilder): @@ -177,12 +174,13 @@ def test_save_input_params(fitted_model_instance): def test_save_load(fitted_model_instance): + rng = np.random.default_rng(42) temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) fitted_model_instance.save(temp.name) test_builder2 = ModelBuilderTest.load(temp.name) assert fitted_model_instance.idata.groups() == test_builder2.idata.groups() assert fitted_model_instance.id == test_builder2.id - x_pred = np.random.uniform(low=0, high=1, size=100) + x_pred = rng.uniform(low=0, high=1, size=100) prediction_data = pd.DataFrame({"input": x_pred}) pred1 = fitted_model_instance.predict(prediction_data["input"]) pred2 = test_builder2.predict(prediction_data["input"]) @@ -213,21 +211,23 @@ def test_empty_sampler_config_fit(toy_X, toy_y): def test_fit(fitted_model_instance): + rng = np.random.default_rng(42) assert fitted_model_instance.idata is not None assert "posterior" in fitted_model_instance.idata.groups() assert fitted_model_instance.idata.posterior.dims["draw"] == 100 - prediction_data = pd.DataFrame( - {"input": np.random.uniform(low=0, high=1, size=100)} - ) + prediction_data = pd.DataFrame({"input": rng.uniform(low=0, high=1, size=100)}) fitted_model_instance.predict(prediction_data["input"]) post_pred = fitted_model_instance.sample_posterior_predictive( prediction_data["input"], extend_idata=True, combined=True ) - post_pred[fitted_model_instance.output_var].shape[0] == prediction_data.input.shape + assert ( + post_pred[fitted_model_instance.output_var].shape[0] + == prediction_data.input.shape[0] + ) -def test_fit_no_y(toy_X): +def test_fit_no_t(toy_X): model_builder = ModelBuilderTest() model_builder.idata = model_builder.fit(X=toy_X, chains=1, draws=100, tune=100) assert model_builder.model is not None @@ -240,7 +240,8 @@ def test_fit_no_y(toy_X): reason="Permissions for temp files not granted on windows CI.", ) def test_predict(fitted_model_instance): - x_pred = np.random.uniform(low=0, high=1, size=100) + rng = np.random.default_rng(42) + x_pred = rng.uniform(low=0, high=1, size=100) prediction_data = pd.DataFrame({"input": x_pred}) pred = fitted_model_instance.predict(prediction_data["input"]) # Perform elementwise comparison using numpy @@ -250,8 +251,9 @@ def test_predict(fitted_model_instance): @pytest.mark.parametrize("combined", [True, False]) def test_sample_posterior_predictive(fitted_model_instance, combined): + rng = np.random.default_rng(42) n_pred = 100 - x_pred = np.random.uniform(low=0, high=1, size=n_pred) + x_pred = rng.uniform(low=0, high=1, size=n_pred) prediction_data = pd.DataFrame({"input": x_pred}) pred = fitted_model_instance.sample_posterior_predictive( prediction_data["input"], combined=combined, extend_idata=True @@ -294,6 +296,7 @@ def test_id(): def test_sample_xxx_predictive_keeps_second( fitted_model_instance, toy_X, name: str ) -> None: + rng = np.random.default_rng(42) method_name = f"sample_{name}" method = getattr(fitted_model_instance, method_name)