Add Flake8-BugBear and Bandit (Security!) (#637)

* fix potential bugs * minor improvements * remove rule for nb * fix test * improve tests syntax * use stacklevel 2 for warnings * use stacklevel 1 for warnings as they are used in public methods * ignore B904 * undo change * ricardos feedback * use fit_posterior * Update pymc_marketing/clv/models/gamma_gamma.py Co-authored-by: Ricardo Vieira <[email protected]> * last fix XD --------- Co-authored-by: Ricardo Vieira <[email protected]>
pymc-labs · Apr 18, 2024 · c02d349 · c02d349
1 parent 582d2ed
commit c02d349
Show file tree

Hide file tree

Showing 14 changed files with 67 additions and 45 deletions.
diff --git a/docs/source/notebooks/clv/clv_quickstart.ipynb b/docs/source/notebooks/clv/clv_quickstart.ipynb
@@ -968,7 +968,7 @@
    "source": [
     "ids = [841, 1981, 157, 1516]\n",
     "ax = az.plot_posterior(num_purchases.sel(customer_id=ids), grid=(2, 2))\n",
-    "for axi, id in zip(ax.ravel(), ids):\n",
+    "for axi, id in zip(ax.ravel(), ids, strict=False):\n",
     "    axi.set_title(f\"Customer: {id}\", size=20)\n",
     "plt.suptitle(\"Expected number purchases in the next period\", fontsize=28, y=1.05);"
    ]

diff --git a/docs/source/notebooks/general/other_nuts_samplers.ipynb b/docs/source/notebooks/general/other_nuts_samplers.ipynb
@@ -170,7 +170,7 @@
     "    for j, (idata, label) in enumerate(\n",
     "        zip(\n",
     "            [idata_blackjax, idata_nutpie, idata_numpyro],\n",
-    "            [\"blackjax\", \"nutpie\", \"numpyro\"],\n",
+    "            [\"blackjax\", \"nutpie\", \"numpyro\"], strict=False,\n",
     "        )\n",
     "    ):\n",
     "        az.plot_posterior(\n",

diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py
@@ -57,7 +57,7 @@ def _add_fit_data_group(self, data: pd.DataFrame) -> None:
                 category=UserWarning,
                 message="The group fit_data is not defined in the InferenceData scheme",
             )
-            assert self.idata is not None
+            assert self.idata is not None  # noqa: S101
             self.idata.add_groups(fit_data=data.to_xarray())
 
     def fit(  # type: ignore
@@ -206,8 +206,8 @@ def thin_fit_result(self, keep_every: int):
             )
 
         """
-        self.fit_result  # Raise Error if fit didn't happen yet
-        assert self.idata is not None
+        self.fit_result  # noqa: B018 (Raise Error if fit didn't happen yet)
+        assert self.idata is not None  # noqa: S101
         new_idata = self.idata.isel(draw=slice(None, None, keep_every)).copy()
         return type(self)._build_with_idata(new_idata)
 
@@ -237,7 +237,7 @@ def fit_result(self, res: az.InferenceData) -> None:
         if self.idata is None:
             self.idata = res
         elif "posterior" in self.idata:
-            warnings.warn("Overriding pre-existing fit_result")
+            warnings.warn("Overriding pre-existing fit_result", stacklevel=1)
             self.idata.posterior = res
         else:
             self.idata.posterior = res

diff --git a/pymc_marketing/clv/models/gamma_gamma.py b/pymc_marketing/clv/models/gamma_gamma.py
@@ -56,10 +56,11 @@ def expected_customer_spend(
         mean_transaction_value, frequency = to_xarray(
             customer_id, mean_transaction_value, frequency
         )
-        assert self.idata is not None, "Model must be fitted first"
-        p = self.idata.posterior["p"]
-        q = self.idata.posterior["q"]
-        v = self.idata.posterior["v"]
+        posterior = self.fit_result
+
+        p = posterior["p"]
+        q = posterior["q"]
+        v = posterior["v"]
 
         individual_weight = p * frequency / (p * frequency + q - 1)
         population_mean = v * p / (q - 1)
@@ -89,10 +90,10 @@ def distribution_new_customer_spend(
     def expected_new_customer_spend(self) -> xarray.DataArray:
         """Expected transaction value for a new customer"""
 
-        assert self.idata is not None, "Model must be fitted first"
-        p_mean = self.idata.posterior["p"]
-        q_mean = self.idata.posterior["q"]
-        v_mean = self.idata.posterior["v"]
+        posterior = self.fit_result
+        p_mean = posterior["p"]
+        q_mean = posterior["q"]
+        v_mean = posterior["v"]
 
         # Closed form solution to the posterior of nu
         # Eq 3 from [1], p.3

diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py
@@ -270,7 +270,7 @@ def _find_first_transactions(
 
 
 def clv_summary(*args, **kwargs):
-    warnings.warn("clv_summary was renamed to rfm_summary", UserWarning)
+    warnings.warn("clv_summary was renamed to rfm_summary", UserWarning, stacklevel=1)
     return rfm_summary(*args, **kwargs)
 
 

diff --git a/pymc_marketing/mmm/base.py b/pymc_marketing/mmm/base.py
@@ -392,6 +392,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure:
         for arg, var_contribution in zip(
             ["control_columns", "yearly_seasonality"],
             ["control_contributions", "fourier_contributions"],
+            strict=True,
         ):
             if getattr(self, arg, None):
                 contributions = self._format_model_contributions(
@@ -413,6 +414,7 @@ def plot_components_contributions(self, **plt_kwargs: Any) -> plt.Figure:
                     "control_contribution",
                     "fourier_contribution",
                 ],
+                strict=False,
             )
         ):
             if self.X is not None:
@@ -688,7 +690,7 @@ def plot_budget_scenearios(
 
         # Plot all scenarios
         for i, (scenario, upper_bound, lower_bound) in enumerate(
-            zip(scenarios, upper_bounds, lower_bounds)
+            zip(scenarios, upper_bounds, lower_bounds, strict=False)
         ):
             color = f"C{i}"
             offset = i * bar_width - 0.4 + bar_width / 2
@@ -914,7 +916,9 @@ def optimize_channel_budget_for_maximum_contribution(
                 "The 'parameters' argument (keyword-only) must be provided and non-empty."
             )
 
-        warnings.warn("This budget allocator method is experimental", UserWarning)
+        warnings.warn(
+            "This budget allocator method is experimental", UserWarning, stacklevel=1
+        )
 
         return budget_allocator(
             method=method,
@@ -947,7 +951,9 @@ def compute_channel_curve_optimization_parameters_original_scale(
             parameters for each channel based on the method used.
         """
         warnings.warn(
-            "The curve optimization parameters method is experimental", UserWarning
+            "The curve optimization parameters method is experimental",
+            UserWarning,
+            stacklevel=1,
         )
 
         channel_contributions = self.compute_channel_contribution_original_scale().mean(
@@ -1044,7 +1050,7 @@ def legend_title_func(channel):
         axes_channels = (
             zip(repeat(axes), channels_to_plot)
             if same_axes
-            else zip(np.ravel(axes), channels_to_plot)
+            else zip(np.ravel(axes), channels_to_plot, strict=False)
         )
 
         for i, (ax, channel) in enumerate(axes_channels):

diff --git a/pymc_marketing/mmm/budget_optimizer.py b/pymc_marketing/mmm/budget_optimizer.py
@@ -101,7 +101,7 @@ def objective_distribution(
 
     sum_contributions = 0.0
 
-    for channel, budget in zip(channels, x):
+    for channel, budget in zip(channels, x, strict=False):
         if method == "michaelis-menten":
             L, k = parameters[channel]
             sum_contributions += michaelis_menten(budget, L, k)
@@ -182,7 +182,9 @@ def optimize_budget_distribution(
         constraints=constraints,
     )
 
-    return {channel: budget for channel, budget in zip(channels, result.x)}
+    return {
+        channel: budget for channel, budget in zip(channels, result.x, strict=False)
+    }
 
 
 def budget_allocator(

diff --git a/pymc_marketing/mmm/delayed_saturated_mmm.py b/pymc_marketing/mmm/delayed_saturated_mmm.py
@@ -652,7 +652,7 @@ def _data_setter(
         try:
             new_channel_data = X[self.channel_columns].to_numpy()
         except KeyError as e:
-            raise RuntimeError("New data must contain channel_data!", e)
+            raise RuntimeError("New data must contain channel_data!") from e
 
         def identity(x):
             return x

diff --git a/pymc_marketing/mmm/transformers.py b/pymc_marketing/mmm/transformers.py
@@ -92,7 +92,7 @@ def batched_convolution(
     if l_max is None:
         try:
             l_max = w.shape[-1].eval()
-        except Exception:
+        except Exception:  # noqa: S110
             pass
     # Get the broadcast shapes of x and w but ignoring their last dimension.
     # The last dimension of x is the "time" axis, which doesn't get broadcast

diff --git a/pymc_marketing/model_builder.py b/pymc_marketing/model_builder.py
@@ -486,7 +486,8 @@ def fit(
 
         X_df = pd.DataFrame(X, columns=X.columns)
         combined_data = pd.concat([X_df, y_df], axis=1)
-        assert all(combined_data.columns), "All columns must have non-empty names"
+        if not all(combined_data.columns):
+            raise ValueError("All columns must have non-empty names")
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore",

diff --git a/pyproject.toml b/pyproject.toml
@@ -59,12 +59,21 @@ repository = "https://github.com/pymc-labs/pymc-marketing"
 #changelog = ""
 
 [tool.ruff.lint]
-select = ["E", "F", "I", "W", "UP", "RUF"]
+select = ["B", "E", "F", "I", "RUF", "S", "UP", "W"]
 ignore = [
+    "B904",   # raise-without-from-inside-except
     "RUF001", # String contains ambiguous character (such as Greek letters)
     "RUF002", # Docstring contains ambiguous character (such as Greek letters)
     "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar`
 ]
+[tool.ruff.lint.per-file-ignores]
+"docs/source/notebooks/*" = [
+    "B018", # Checks for "useless" expressions. Not useful for notebooks.
+]
+"tests/*" = [
+    "B018", # Checks for "useless" expressions. This is useful for tests.
+    "S101", # Use of assert
+]
 
 [tool.ruff.lint.pycodestyle]
 max-line-length = 120

diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
@@ -31,14 +31,14 @@ def test_to_xarray():
     np.testing.assert_array_equal(new_x.coords["customer_id"], customer_id)
     np.testing.assert_array_equal(x, new_x.values)
 
-    for old, new in zip((x, y, z), to_xarray(customer_id, x, y, z)):
+    for old, new in zip((x, y, z), to_xarray(customer_id, x, y, z), strict=False):
         assert isinstance(new, xarray.DataArray)
         assert new.dims == ("customer_id",)
         np.testing.assert_array_equal(new.coords["customer_id"], customer_id)
         np.testing.assert_array_equal(old, new.values)
 
     new_y = to_xarray(customer_id, y, dim="test_dim")
-    new_y.dims == ("test_dim",)
+    assert new_y.dims == ("test_dim",)
     np.testing.assert_array_equal(new_y.coords["test_dim"], customer_id)
 
 

diff --git a/tests/mmm/test_lift_test.py b/tests/mmm/test_lift_test.py
@@ -98,7 +98,7 @@ def test_lift_tests_missing(df_lift_tests) -> None:
     with pytest.raises(MissingLiftTestError) as err:
         lift_test_indices(df_lift_tests, model)
 
-    err.value.missing_values.tolist() == [0]
+    assert err.value.missing_values.tolist() == [0]
 
 
 @pytest.fixture

diff --git a/tests/model_builder/test_model_builder.py b/tests/model_builder/test_model_builder.py
@@ -25,26 +25,24 @@
 
 from pymc_marketing.model_builder import ModelBuilder
 
-rng = np.random.default_rng(42)
-
 
 @pytest.fixture(scope="module")
 def toy_X():
     x = np.linspace(start=0, stop=1, num=100)
-    X = pd.DataFrame({"input": x})
-    return X
+    return pd.DataFrame({"input": x})
 
 
 @pytest.fixture(scope="module")
 def toy_y(toy_X):
+    rng = np.random.default_rng(42)
     y = 5 * toy_X["input"] + 3
-    y = y + np.random.normal(0, 1, size=len(toy_X))
+    y = y + rng.normal(0, 1, size=len(toy_X))
     y = pd.Series(y, name="output")
     return y
 
 
 @pytest.fixture(scope="module")
-def fitted_model_instance(toy_X, toy_y):
+def fitted_model_instance(toy_X):
     sampler_config = {
         "draws": 100,
         "tune": 100,
@@ -71,19 +69,18 @@ def fitted_model_instance(toy_X, toy_y):
 
 
 @pytest.fixture(scope="module")
-def not_fitted_model_instance(toy_X, toy_y):
+def not_fitted_model_instance():
     sampler_config = {"draws": 100, "tune": 100, "chains": 2, "target_accept": 0.95}
     model_config = {
         "a": {"loc": 0, "scale": 10, "dims": ("numbers",)},
         "b": {"loc": 0, "scale": 10},
         "obs_error": 2,
     }
-    model = ModelBuilderTest(
+    return ModelBuilderTest(
         model_config=model_config,
         sampler_config=sampler_config,
         test_parameter="test_paramter",
     )
-    return model
 
 
 class ModelBuilderTest(ModelBuilder):
@@ -177,12 +174,13 @@ def test_save_input_params(fitted_model_instance):
 
 
 def test_save_load(fitted_model_instance):
+    rng = np.random.default_rng(42)
     temp = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
     fitted_model_instance.save(temp.name)
     test_builder2 = ModelBuilderTest.load(temp.name)
     assert fitted_model_instance.idata.groups() == test_builder2.idata.groups()
     assert fitted_model_instance.id == test_builder2.id
-    x_pred = np.random.uniform(low=0, high=1, size=100)
+    x_pred = rng.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
     pred1 = fitted_model_instance.predict(prediction_data["input"])
     pred2 = test_builder2.predict(prediction_data["input"])
@@ -213,21 +211,23 @@ def test_empty_sampler_config_fit(toy_X, toy_y):
 
 
 def test_fit(fitted_model_instance):
+    rng = np.random.default_rng(42)
     assert fitted_model_instance.idata is not None
     assert "posterior" in fitted_model_instance.idata.groups()
     assert fitted_model_instance.idata.posterior.dims["draw"] == 100
 
-    prediction_data = pd.DataFrame(
-        {"input": np.random.uniform(low=0, high=1, size=100)}
-    )
+    prediction_data = pd.DataFrame({"input": rng.uniform(low=0, high=1, size=100)})
     fitted_model_instance.predict(prediction_data["input"])
     post_pred = fitted_model_instance.sample_posterior_predictive(
         prediction_data["input"], extend_idata=True, combined=True
     )
-    post_pred[fitted_model_instance.output_var].shape[0] == prediction_data.input.shape
+    assert (
+        post_pred[fitted_model_instance.output_var].shape[0]
+        == prediction_data.input.shape[0]
+    )
 
 
-def test_fit_no_y(toy_X):
+def test_fit_no_t(toy_X):
     model_builder = ModelBuilderTest()
     model_builder.idata = model_builder.fit(X=toy_X, chains=1, draws=100, tune=100)
     assert model_builder.model is not None
@@ -240,7 +240,8 @@ def test_fit_no_y(toy_X):
     reason="Permissions for temp files not granted on windows CI.",
 )
 def test_predict(fitted_model_instance):
-    x_pred = np.random.uniform(low=0, high=1, size=100)
+    rng = np.random.default_rng(42)
+    x_pred = rng.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
     pred = fitted_model_instance.predict(prediction_data["input"])
     # Perform elementwise comparison using numpy
@@ -250,8 +251,9 @@ def test_predict(fitted_model_instance):
 
 @pytest.mark.parametrize("combined", [True, False])
 def test_sample_posterior_predictive(fitted_model_instance, combined):
+    rng = np.random.default_rng(42)
     n_pred = 100
-    x_pred = np.random.uniform(low=0, high=1, size=n_pred)
+    x_pred = rng.uniform(low=0, high=1, size=n_pred)
     prediction_data = pd.DataFrame({"input": x_pred})
     pred = fitted_model_instance.sample_posterior_predictive(
         prediction_data["input"], combined=combined, extend_idata=True
@@ -294,6 +296,7 @@ def test_id():
 def test_sample_xxx_predictive_keeps_second(
     fitted_model_instance, toy_X, name: str
 ) -> None:
+    rng = np.random.default_rng(42)
     method_name = f"sample_{name}"
     method = getattr(fitted_model_instance, method_name)