Merge pull request #72 from uclamii/eval_metrics_ls
Add new eval_metrics.py script for model performance metrics
elemets authored Oct 24, 2024
2 parents 3a900a8 + 60a9c23 commit fd8dc38
Showing 2 changed files with 123 additions and 25 deletions.
9 changes: 4 additions & 5 deletions notebooks/catboost_kfold.py
@@ -17,9 +17,6 @@
f"{estimator_name}__depth": [10],
f"{estimator_name}__learning_rate": [1e-4],
f"{estimator_name}__n_estimators": [30],
f"{estimator_name}__early_stopping_rounds": [10],
f"{estimator_name}__verbose": [0],
f"{estimator_name}__eval_metric": ["Logloss"],
}

model = Model(
@@ -31,15 +28,15 @@
grid=tuned_parameters,
randomized_grid=False,
n_iter=4,
boost_early=True,
boost_early=False,
scoring=["roc_auc"],
n_jobs=-2,
random_state=42,
kfold=True,
)


model.grid_search_param_tuning(X, y)
model.grid_search_param_tuning(X, y, f1_beta_tune=True)


model.fit(X, y)
@@ -49,3 +46,5 @@

### F1 Weighted
y_pred = model.predict(X)

metrics = model.return_metrics(X, y)
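The notebook now enables f1_beta_tune=True, which tunes a probability threshold per scorer by maximizing F-beta on each fold's held-out split (see the tune_threshold_Fbeta calls further down in model_tuner_utils.py). As a rough illustration only, not the library's implementation, that selection step can be sketched as a grid search over candidate cutoffs; pick_fbeta_threshold below is a hypothetical helper:

import numpy as np
from sklearn.metrics import fbeta_score

def pick_fbeta_threshold(y_true, y_pred_proba, beta=1.0, n_grid=101):
    # Evaluate F-beta at evenly spaced probability cutoffs and keep the best one.
    thresholds = np.linspace(0.0, 1.0, n_grid)
    scores = [
        fbeta_score(y_true, (y_pred_proba >= t).astype(int), beta=beta, zero_division=0)
        for t in thresholds
    ]
    return thresholds[int(np.argmax(scores))]

Averaging such per-fold thresholds is what populates self.threshold[score] in grid_search_param_tuning below.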
139 changes: 119 additions & 20 deletions src/model_tuner/model_tuner_utils.py
@@ -12,6 +12,11 @@
mean_absolute_error,
median_absolute_error,
r2_score,
precision_score,
recall_score,
roc_auc_score,
average_precision_score,
brier_score_loss,
)

from skopt import BayesSearchCV
@@ -590,7 +595,6 @@ def fit(self, X, y, validation_data=None, score=None):
self.kf,
scoring=scorer,
)

else:
if score is None:
best_params = self.best_params_per_score[self.scoring[0]]["params"]
@@ -791,6 +795,7 @@ def return_metrics(self, X_test, y_test, optimal_threshold=False):

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set." + "\n")
self.return_metrics_kfold(X_test, y_test, self.test_model, score)

else:
self.regression_report_kfold(X_test, y_test, self.test_model, score)
@@ -808,7 +813,10 @@ def return_metrics(self, X_test, y_test, optimal_threshold=False):
conf_mat = confusion_matrix(y_test, y_pred_valid)
print("Confusion matrix on set provided: ")
_confusion_matrix_print(conf_mat, self.labels)

model_metrics_df = report_model_metrics(self, X_test, y_test)
print("-" * 80)
pprint(model_metrics_df.iloc[0].to_dict())
print("-" * 80)
print()
self.classification_report = classification_report(
y_test, y_pred_valid, output_dict=True
@@ -894,13 +902,11 @@ def grid_search_param_tuning(
thresh_list = []
self.kfold = False
for train, test in self.kf.split(X, y):

self.fit(X.iloc[train], y.iloc[train])
y_pred_proba = self.predict_proba(X.iloc[test])[:, 1]
thresh = self.tune_threshold_Fbeta(
score,
X.iloc[train],
y.iloc[train],
X.iloc[test],
y.iloc[test],
betas,
y_pred_proba,
@@ -909,18 +915,18 @@
thresh_list.append(thresh)
average_threshold = np.mean(thresh_list)
self.threshold[score] = average_threshold
self.kfold = True

else:
for score in self.scoring:
thresh_list = []
self.kfold = False
for train, test in self.kf.split(X, y):

self.fit(X[train], y[train])
y_pred_proba = self.predict_proba(X[test])[:, 1]
thresh = self.tune_threshold_Fbeta(
score,
X[train],
y[train],
X[test],
y[test],
betas,
y_pred_proba,
@@ -965,13 +971,6 @@ def grid_search_param_tuning(
### a new model to be fitted each time.
self.reset_estimator()
if self.boost_early:
estimator_verbosity = f"{self.estimator_name}__verbose"

if params.get(estimator_verbosity):
self.verbosity = params[estimator_verbosity]
params.pop(estimator_verbosity)
else:
self.verbosity = False

if self.feature_selection or self.pipeline_steps:
# Extract parameters for preprocessing and feature selection
@@ -1025,15 +1024,17 @@ def grid_search_param_tuning(
estimator_eval_set = f"{self.estimator_name}__eval_set"
estimator_verbosity = f"{self.estimator_name}__verbose"

if params.get(estimator_verbosity):
self.verbosity = params[estimator_verbosity]
params.pop(estimator_verbosity)
else:
self.verbosity = False

xgb_params = {
estimator_eval_set: eval_set,
estimator_verbosity: self.verbosity,
}

## TODO: Why are we popping verbosity off?
if estimator_verbosity in params:
params.pop(estimator_verbosity)

clf = self.estimator.set_params(**params).fit(
X_train, y_train, **xgb_params
)
@@ -1292,6 +1293,44 @@ def get_best_score_params(self, X, y):
}
# self.estimator = clf.best_estimator_

def return_metrics_kfold(self, X, y, test_model, score=None):

aggregated_pred_list = []
if score is not None:
threshold = self.threshold[score]
else:
threshold = self.threshold[self.scoring[0]]

if threshold == 0:
threshold = 0.5

if isinstance(X, pd.DataFrame):
for train, test in self.kf.split(X, y):
X_train, X_test = X.iloc[train], X.iloc[test]
y_train, y_test = y.iloc[train], y.iloc[test]
test_model.fit(X_train, y_train)
aggregated_pred_list.append(
report_model_metrics(test_model, X_test, y_test, threshold),
)
else:
for train, test in self.kf.split(X, y):
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
test_model.fit(X_train, y_train)
aggregated_pred_list.append(
report_model_metrics(test_model, X_test, y_test, threshold),
)

concat_df = pd.concat(aggregated_pred_list)
# Calculate the mean for each column
mean_df = concat_df.groupby(concat_df.index).mean()
mean_dict = mean_df.iloc[0].to_dict()
print("-" * 80)
print(f"Average performance across {len(aggregated_pred_list)} Folds:")
pprint(mean_dict)
print("-" * 80)
return mean_dict

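The averaging in return_metrics_kfold relies on every per-fold frame returned by report_model_metrics sharing the single index 0, so groupby(concat_df.index).mean() collapses them into one row of column-wise means. A standalone sketch of that pandas pattern, with made-up numbers purely for illustration:

import pandas as pd

# Three single-row metric frames, each indexed by 0, standing in for the
# per-fold output of report_model_metrics.
fold_frames = [
    pd.DataFrame({"AUC ROC": [0.91], "Brier Score": [0.08]}),
    pd.DataFrame({"AUC ROC": [0.89], "Brier Score": [0.10]}),
    pd.DataFrame({"AUC ROC": [0.93], "Brier Score": [0.07]}),
]
concat_df = pd.concat(fold_frames)                    # index is [0, 0, 0]
mean_df = concat_df.groupby(concat_df.index).mean()   # column-wise mean per index value
print(mean_df.iloc[0].to_dict())                      # {'AUC ROC': 0.91, 'Brier Score': ~0.083}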
def conf_mat_class_kfold(self, X, y, test_model, score=None):

aggregated_true_labels = []
Expand Down Expand Up @@ -1337,7 +1376,13 @@ def conf_mat_class_kfold(self, X, y, test_model, score=None):
)
# Now, outside the fold loop, calculate and print the overall classification report
print(f"Classification Report Averaged Across All Folds for {score}:")
print(self.classification_report)
print(
classification_report(
aggregated_true_labels,
aggregated_predictions,
zero_division=0,
)
)
print("-" * 80)
return {
"Classification Report": self.classification_report,
@@ -1600,3 +1645,57 @@ def print_pipeline(pipeline):
print()


def report_model_metrics(
model,
X_valid=None,
y_valid=None,
threshold=0.5,
):
"""
Generate a DataFrame of model performance metrics for a fitted
classification model evaluated on a validation set.
Parameters:
-----------
model : estimator
A fitted classifier exposing ``predict_proba``.
X_valid : DataFrame, optional
The feature set used for validating the model.
y_valid : Series, optional
The true labels for the validation set.
threshold : float, default=0.5
Probability cutoff used to convert predicted probabilities into class labels.
Returns:
--------
metrics_df : DataFrame
A single-row DataFrame containing the calculated metrics:
- Precision/PPV
- Average Precision
- Sensitivity (Recall)
- Specificity
- AUC ROC
- Brier Score
"""

metrics = {}
y_pred_proba = model.predict_proba(X_valid)[:, 1]
y_pred = [1 if pred > threshold else 0 for pred in y_pred_proba]
tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_pred_proba)
brier_score = brier_score_loss(y_valid, y_pred_proba)
avg_precision = average_precision_score(y_valid, y_pred_proba)
specificity = tn / (tn + fp)
metrics = {
"Precision/PPV": precision,
"Average Precision": avg_precision,
"Sensitivity": recall,
"Specificity": specificity,
"AUC ROC": roc_auc,
"Brier Score": brier_score,
}

metrics_df = pd.DataFrame(metrics, index=[0])
return metrics_df

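For context, a hypothetical standalone usage of the new report_model_metrics helper on a held-out split; the synthetic data, the LogisticRegression stand-in, and the model_tuner.model_tuner_utils import path are assumptions for illustration:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from model_tuner.model_tuner_utils import report_model_metrics  # assumed import path

# Toy binary classification problem and a simple fitted classifier.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X, y = pd.DataFrame(X), pd.Series(y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Single-row frame with Precision/PPV, Average Precision, Sensitivity,
# Specificity, AUC ROC, and Brier Score at the chosen threshold.
metrics_df = report_model_metrics(clf, X_valid, y_valid, threshold=0.5)
print(metrics_df.T)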