Merge pull request #72 from uclamii/eval_metrics_ls
Add new eval_metrics.py script for model performance metrics
elemets authored Oct 24, 2024
2 parents 3a900a8 + 60a9c23 commit fd8dc38
Showing 2 changed files with 123 additions and 25 deletions.
9 changes: 4 additions & 5 deletions notebooks/catboost_kfold.py
@@ -17,9 +17,6 @@
f"{estimator_name}__depth": [10],
f"{estimator_name}__learning_rate": [1e-4],
f"{estimator_name}__n_estimators": [30],
f"{estimator_name}__early_stopping_rounds": [10],
f"{estimator_name}__verbose": [0],
f"{estimator_name}__eval_metric": ["Logloss"],
}

model = Model(
@@ -31,15 +28,15 @@
grid=tuned_parameters,
randomized_grid=False,
n_iter=4,
boost_early=True,
boost_early=False,
scoring=["roc_auc"],
n_jobs=-2,
random_state=42,
kfold=True,
)


model.grid_search_param_tuning(X, y)
model.grid_search_param_tuning(X, y, f1_beta_tune=True)


model.fit(X, y)
@@ -49,3 +46,5 @@

### F1 Weighted
y_pred = model.predict(X)

metrics = model.return_metrics(X, y)
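The notebook now enables f1_beta_tune=True, which tunes a probability threshold per scorer by maximizing F-beta on each fold's held-out split (see the tune_threshold_Fbeta calls further down in model_tuner_utils.py). As a rough illustration only, not the library's implementation, that selection step can be sketched as a grid search over candidate cutoffs; pick_fbeta_threshold below is a hypothetical helper:

import numpy as np
from sklearn.metrics import fbeta_score

def pick_fbeta_threshold(y_true, y_pred_proba, beta=1.0, n_grid=101):
    # Evaluate F-beta at evenly spaced probability cutoffs and keep the best one.
    thresholds = np.linspace(0.0, 1.0, n_grid)
    scores = [
        fbeta_score(y_true, (y_pred_proba >= t).astype(int), beta=beta, zero_division=0)
        for t in thresholds
    ]
    return thresholds[int(np.argmax(scores))]

Averaging such per-fold thresholds is what populates self.threshold[score] in grid_search_param_tuning below.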
139 changes: 119 additions & 20 deletions src/model_tuner/model_tuner_utils.py
@@ -12,6 +12,11 @@
mean_absolute_error,
median_absolute_error,
r2_score,
precision_score,
recall_score,
roc_auc_score,
average_precision_score,
brier_score_loss,
)

from skopt import BayesSearchCV
@@ -590,7 +595,6 @@ def fit(self, X, y, validation_data=None, score=None):
self.kf,
scoring=scorer,
)

else:
if score is None:
best_params = self.best_params_per_score[self.scoring[0]]["params"]
@@ -791,6 +795,7 @@ def return_metrics(self, X_test, y_test, optimal_threshold=False):

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set." + "\n")
self.return_metrics_kfold(X_test, y_test, self.test_model, score)

else:
self.regression_report_kfold(X_test, y_test, self.test_model, score)
@@ -808,7 +813,10 @@ def return_metrics(self, X_test, y_test, optimal_threshold=False):
conf_mat = confusion_matrix(y_test, y_pred_valid)
print("Confusion matrix on set provided: ")
_confusion_matrix_print(conf_mat, self.labels)

model_metrics_df = report_model_metrics(self, X_test, y_test)
print("-" * 80)
pprint(model_metrics_df.iloc[0].to_dict())
print("-" * 80)
print()
self.classification_report = classification_report(
y_test, y_pred_valid, output_dict=True
@@ -894,13 +902,11 @@ def grid_search_param_tuning(
thresh_list = []
self.kfold = False
for train, test in self.kf.split(X, y):

self.fit(X.iloc[train], y.iloc[train])
y_pred_proba = self.predict_proba(X.iloc[test])[:, 1]
thresh = self.tune_threshold_Fbeta(
score,
X.iloc[train],
y.iloc[train],
X.iloc[test],
y.iloc[test],
betas,
y_pred_proba,
@@ -909,18 +915,18 @@
thresh_list.append(thresh)
average_threshold = np.mean(thresh_list)
self.threshold[score] = average_threshold
self.kfold = True

else:
for score in self.scoring:
thresh_list = []
self.kfold = False
for train, test in self.kf.split(X, y):

self.fit(X[train], y[train])
y_pred_proba = self.predict_proba(X[test])[:, 1]
thresh = self.tune_threshold_Fbeta(
score,
X[train],
y[train],
X[test],
y[test],
betas,
y_pred_proba,
@@ -965,13 +971,6 @@ def grid_search_param_tuning(
### a new model to be fitted each time.
self.reset_estimator()
if self.boost_early:
estimator_verbosity = f"{self.estimator_name}__verbose"

if params.get(estimator_verbosity):
self.verbosity = params[estimator_verbosity]
params.pop(estimator_verbosity)
else:
self.verbosity = False

if self.feature_selection or self.pipeline_steps:
# Extract parameters for preprocessing and feature selection
@@ -1025,15 +1024,17 @@ def grid_search_param_tuning(
estimator_eval_set = f"{self.estimator_name}__eval_set"
estimator_verbosity = f"{self.estimator_name}__verbose"

if params.get(estimator_verbosity):
self.verbosity = params[estimator_verbosity]
params.pop(estimator_verbosity)
else:
self.verbosity = False

xgb_params = {
estimator_eval_set: eval_set,
estimator_verbosity: self.verbosity,
}

## TODO: Why are we popping verbosity off?
if estimator_verbosity in params:
params.pop(estimator_verbosity)

clf = self.estimator.set_params(**params).fit(
X_train, y_train, **xgb_params
)
@@ -1292,6 +1293,44 @@ def get_best_score_params(self, X, y):
}
# self.estimator = clf.best_estimator_

def return_metrics_kfold(self, X, y, test_model, score=None):

aggregated_pred_list = []
if score is not None:
threshold = self.threshold[score]
else:
threshold = self.threshold[self.scoring[0]]

if threshold == 0:
threshold = 0.5

if isinstance(X, pd.DataFrame):
for train, test in self.kf.split(X, y):
X_train, X_test = X.iloc[train], X.iloc[test]
y_train, y_test = y.iloc[train], y.iloc[test]
test_model.fit(X_train, y_train)
aggregated_pred_list.append(
report_model_metrics(test_model, X_test, y_test, threshold),
)
else:
for train, test in self.kf.split(X, y):
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
test_model.fit(X_train, y_train)
aggregated_pred_list.append(
report_model_metrics(test_model, X_test, y_test, threshold),
)

concat_df = pd.concat(aggregated_pred_list)
# Calculate the mean for each column
mean_df = concat_df.groupby(concat_df.index).mean()
mean_dict = mean_df.iloc[0].to_dict()
print("-" * 80)
print(f"Average performance across {len(aggregated_pred_list)} Folds:")
pprint(mean_dict)
print("-" * 80)
return mean_dict

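The averaging in return_metrics_kfold relies on every per-fold frame returned by report_model_metrics sharing the single index 0, so groupby(concat_df.index).mean() collapses them into one row of column-wise means. A standalone sketch of that pandas pattern, with made-up numbers purely for illustration:

import pandas as pd

# Three single-row metric frames, each indexed by 0, standing in for the
# per-fold output of report_model_metrics.
fold_frames = [
    pd.DataFrame({"AUC ROC": [0.91], "Brier Score": [0.08]}),
    pd.DataFrame({"AUC ROC": [0.89], "Brier Score": [0.10]}),
    pd.DataFrame({"AUC ROC": [0.93], "Brier Score": [0.07]}),
]
concat_df = pd.concat(fold_frames)                    # index is [0, 0, 0]
mean_df = concat_df.groupby(concat_df.index).mean()   # column-wise mean per index value
print(mean_df.iloc[0].to_dict())                      # {'AUC ROC': 0.91, 'Brier Score': ~0.083}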
def conf_mat_class_kfold(self, X, y, test_model, score=None):

aggregated_true_labels = []
Expand Down Expand Up @@ -1337,7 +1376,13 @@ def conf_mat_class_kfold(self, X, y, test_model, score=None):
)
# Now, outside the fold loop, calculate and print the overall classification report
print(f"Classification Report Averaged Across All Folds for {score}:")
print(self.classification_report)
print(
classification_report(
aggregated_true_labels,
aggregated_predictions,
zero_division=0,
)
)
print("-" * 80)
return {
"Classification Report": self.classification_report,
@@ -1600,3 +1645,57 @@ def print_pipeline(pipeline):
print()


def report_model_metrics(
model,
X_valid=None,
y_valid=None,
threshold=0.5,
):
"""
Generate a DataFrame of model performance metrics for a fitted
classification model evaluated on a validation set.
Parameters:
-----------
model : estimator
A fitted classifier exposing ``predict_proba``.
X_valid : DataFrame, optional
The feature set used for validating the model.
y_valid : Series, optional
The true labels for the validation set.
threshold : float, default=0.5
Probability cutoff used to convert predicted probabilities into class labels.
Returns:
--------
metrics_df : DataFrame
A single-row DataFrame containing the calculated metrics:
- Precision/PPV
- Average Precision
- Sensitivity (Recall)
- Specificity
- AUC ROC
- Brier Score
"""

metrics = {}
y_pred_proba = model.predict_proba(X_valid)[:, 1]
y_pred = [1 if pred > threshold else 0 for pred in y_pred_proba]
tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_pred_proba)
brier_score = brier_score_loss(y_valid, y_pred_proba)
avg_precision = average_precision_score(y_valid, y_pred_proba)
specificity = tn / (tn + fp)
metrics = {
"Precision/PPV": precision,
"Average Precision": avg_precision,
"Sensitivity": recall,
"Specificity": specificity,
"AUC ROC": roc_auc,
"Brier Score": brier_score,
}

metrics_df = pd.DataFrame(metrics, index=[0])
return metrics_df

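For context, a hypothetical standalone usage of the new report_model_metrics helper on a held-out split; the synthetic data, the LogisticRegression stand-in, and the model_tuner.model_tuner_utils import path are assumptions for illustration:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from model_tuner.model_tuner_utils import report_model_metrics  # assumed import path

# Toy binary classification problem and a simple fitted classifier.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X, y = pd.DataFrame(X), pd.Series(y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Single-row frame with Precision/PPV, Average Precision, Sensitivity,
# Specificity, AUC ROC, and Brier Score at the chosen threshold.
metrics_df = report_model_metrics(clf, X_valid, y_valid, threshold=0.5)
print(metrics_df.T)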