Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: replace output file names in classification with an enum class #377

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 32 additions & 22 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,24 +144,26 @@ def _classification_report(y_true: pd.DataFrame, y_predict: pd.DataFrame, algori
mlflow.log_artifact(os.path.join(store_path, f"Classification Report - {algorithm_name}.txt"))

@staticmethod
def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
def _cross_validation(trained_model: object, X_train: pd.DataFrame, graph_name: str, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
"""Perform cross validation on the model."""
print("-----* Cross Validation *-----")
print(f"-----* {graph_name} *-----")
print(f"K-Folds: {cv_num}")
scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
scores = cross_validation(trained_model, X_train, y_train, graph_name, average=average, cv_num=cv_num)
scores_str = json.dumps(scores, indent=4)
save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path)
save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)

@staticmethod
def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _plot_confusion_matrix(
y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the confusion matrix of the model."""
print("-----* Confusion Matrix *-----")
data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
print(f"-----* {graph_name} *-----")
data = plot_confusion_matrix(y_test, y_test_predict, trained_model, graph_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
data = pd.DataFrame(data, columns=columns, index=index)
save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)
save_data(data, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)

@staticmethod
def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
Expand Down Expand Up @@ -192,29 +194,29 @@ def _plot_precision_recall_threshold(
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print("-----* ROC Curve *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path)
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print(f"-----* {graph_name} *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, "ROC Curve - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path)
save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_2d_decision_boundary(
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the decision boundary of the trained model with the testing data set below."""
print("-----* Two-dimensional Decision Boundary Diagram *-----")
print(f"-----* {graph_name} *-----")
plot_2d_decision_boundary(X, X_test, trained_model, image_config)
save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path)
save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, f"{graph_name} - X", local_path, mlflow_path)
save_data(X_test, name_column2, f"{graph_name} - X Test", local_path, mlflow_path)

@staticmethod
def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:
Expand Down Expand Up @@ -286,6 +288,7 @@ def common_components(self) -> None:
trained_model=self.model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -296,6 +299,7 @@ def common_components(self) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -326,6 +330,7 @@ def common_components(self) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -348,6 +353,7 @@ def common_components(self) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -374,6 +380,7 @@ def common_components(self, is_automl: bool) -> None:
trained_model=self.auto_model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -384,6 +391,7 @@ def common_components(self, is_automl: bool) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -414,6 +422,7 @@ def common_components(self, is_automl: bool) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -436,6 +445,7 @@ def common_components(self, is_automl: bool) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> tuple[str, Dict]:
return average, scores


def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object) -> np.ndarray:
def plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object, graph_name: str) -> np.ndarray:
"""Plot the confusion matrix.

Parameters
Expand Down Expand Up @@ -124,7 +124,7 @@ def display_cross_validation_scores(scores: np.ndarray, score_name: str) -> Dict
return cv_scores


def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int = 10) -> Dict:
def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, graph_name: str, average: str, cv_num: int = 10) -> Dict:
"""Evaluate metric(s) by cross-validation and also record fit/score times.

Parameters
Expand Down Expand Up @@ -286,7 +286,7 @@ def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame,
return y_probs, precisions, recalls, thresholds


def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple:
"""Plot the ROC curve.

Parameters
Expand Down Expand Up @@ -324,7 +324,7 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title(f"ROC Curve - {algorithm_name}")
plt.title(f"{graph_name} - {algorithm_name}")
return y_probs, fpr, tpr, thresholds


Expand Down
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path:
Whether to write the index.
"""
if name_column is not None and len(df) == len(name_column):
name_column = name_column.loc[df.index].reset_index(drop=True)
# name_column = name_column.loc[df.index].reset_index(drop=True)
df.reset_index(drop=True, inplace=True)
name_column.reset_index(drop=True, inplace=True)
df = pd.concat([name_column, df], axis=1)
Expand Down
Loading