Merge pull request ZJUEarthData#200 from ZJUEarthData/web

perf: abstract LinearWorkflowMixin with formula display for all linear models both in regression and in classification.
PotatoXi · Jul 28, 2023 · 49d5f0b · 49d5f0b
2 parents 0299e42 + a5cf654
commit 49d5f0b
Show file tree

Hide file tree

Showing 9 changed files with 146 additions and 151 deletions.
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -27,9 +27,8 @@
 # the directory where the trained model saved
 MODEL_PATH = os.path.join(OUTPUT_PATH, "trained_models")
 
-# the directory where the data is saved within the MLflow run's artifact directory
+# the directory where the artifact is saved within the MLflow run's artifact directory
 MLFLOW_ARTIFACT_DATA_PATH = "data"
-
 MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH = os.path.join("image", "statistic")
 MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH = os.path.join("image", "model_output")
 MLFLOW_ARTIFACT_IMAGE_MAP_PATH = os.path.join("image", "map")

diff --git a/geochemistrypi/data_mining/model/_base.py b/geochemistrypi/data_mining/model/_base.py
@@ -15,7 +15,7 @@
 from ..constants import SECTION
 from ..data.data_readiness import limit_num_input, num2option, num_input, show_data_columns
 from ..utils.base import save_data, save_fig, save_text
-from .func._common_supervised import plot_decision_tree, plot_feature_importance
+from .func._common_supervised import plot_decision_tree, plot_feature_importance, show_formula
 
 
 class WorkflowBase(metaclass=ABCMeta):
@@ -300,8 +300,21 @@ def _plot_feature_importance(X_train: pd.DataFrame, trained_model: object, image
         save_fig(f"Feature Importance - {algorithm_name}", local_path, mlflow_path)
         save_data(data, f"Feature Importance - {algorithm_name}", local_path, mlflow_path, True)
 
-    def _plot_tree(self, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+    @staticmethod
+    def _plot_tree(trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
         """Drawing decision tree diagrams."""
         print("-----* Single Tree Diagram *-----")
         plot_decision_tree(trained_model, image_config)
         save_fig(f"Tree Diagram - {algorithm_name}", local_path, mlflow_path)
+
+
+class LinearWorkflowMixin:
+    """Mixin collecting the special functions shared by linear-model workflows."""
+
+    @staticmethod
+    def _show_formula(coef: np.ndarray, intercept: np.ndarray, features_name: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
+        """Print the formula of a linear model and save it as text.
+
+        The formula dictionary returned by ``show_formula`` is serialised as
+        indented JSON and passed to ``save_text`` under the name
+        "<algorithm_name> Formula", together with ``local_path`` and ``mlflow_path``.
+        """
+        print(f"-----* {algorithm_name} Formula *-----")
+        serialized = json.dumps(show_formula(coef, intercept, features_name), indent=4)
+        save_text(serialized, f"{algorithm_name} Formula", local_path, mlflow_path)
diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py
@@ -19,7 +19,7 @@
 
 from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, RAY_FLAML
 from ..utils.base import save_data, save_fig, save_text
-from ._base import TreeWorkflowMixin, WorkflowBase
+from ._base import LinearWorkflowMixin, TreeWorkflowMixin, WorkflowBase
 from .func.algo_classification._common import cross_validation, plot_2d_decision_boundary, plot_confusion_matrix, plot_precision_recall, plot_ROC, score
 from .func.algo_classification._decision_tree import decision_tree_manual_hyper_parameters
 from .func.algo_classification._deep_neural_network import deep_neural_network_manual_hyper_parameters
@@ -1485,11 +1485,11 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:
         )
 
 
-class LogisticRegressionClassification(ClassificationWorkflowBase):
+class LogisticRegressionClassification(LinearWorkflowMixin, ClassificationWorkflowBase):
     """The automation workflow of using Logistic Regression algorithm to make insightful products."""
 
     name = "Logistic Regression"
-    special_function = ["Feature Importance"]
+    special_function = ["Logistic Regression Formula", "Feature Importance Diagram"]
 
     def __init__(
         self,
@@ -1729,6 +1729,15 @@ def _plot_feature_importance(columns_name: np.ndarray, trained_model: any, algor
     def special_components(self, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by Scikit-learn framework."""
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=self.model.coef_,
+            intercept=self.model.intercept_,
+            features_name=LogisticRegressionClassification.X.columns,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
         self._plot_feature_importance(
             columns_name=LogisticRegressionClassification.X.columns,
             trained_model=self.model,
@@ -1741,6 +1750,15 @@ def special_components(self, **kwargs) -> None:
     def special_components(self, is_automl: bool = False, **kwargs) -> None:
         """Invoke all special application functions for this algorithms by FLAML framework."""
         GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
+        GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
+        self._show_formula(
+            coef=self.auto_model.coef_,
+            intercept=self.auto_model.intercept_,
+            features_name=LogisticRegressionClassification.X.columns,
+            algorithm_name=self.naming,
+            local_path=GEOPI_OUTPUT_ARTIFACTS_PATH,
+            mlflow_path="root",
+        )
         self._plot_feature_importance(
             columns_name=LogisticRegressionClassification.X.columns,
             trained_model=self.auto_model,

diff --git a/geochemistrypi/data_mining/model/func/_common_supervised.py b/geochemistrypi/data_mining/model/func/_common_supervised.py
@@ -7,7 +7,7 @@
 from sklearn.tree import plot_tree
 
 # <------
-# Used by decsion tree including classification and regression
+# Used by tree-based models including classification and regression besides XGBoost
 
 
 def plot_decision_tree(trained_model: object, image_config: Dict) -> None:
@@ -72,7 +72,7 @@ def plot_decision_tree(trained_model: object, image_config: Dict) -> None:
     )
 
 
-# Used by decsion tree including classification and regression
+# Used by tree-based models including classification and regression besides XGBoost
 # ------>
 
 # <------
@@ -139,3 +139,60 @@ def plot_feature_importance(columns_name: pd.Index, feature_importance: np.ndarr
     )
 
     return importance
+
+
+# Used by tree-based models such as random forest, extra-trees and XGBoost, for both classification and regression
+# ------>
+
+# <------
+# Used by linear models including classification and regression
+
+
+def show_formula(coef: np.ndarray, intercept: np.ndarray, features_name: np.ndarray) -> Dict:
+    """Build, print and return the formula of a linear model.
+
+    Coefficients are rounded to three decimals and every term whose rounded
+    coefficient is zero is left out. The intercept is appended, unrounded, as
+    the last term.
+
+    Parameters
+    ----------
+    coef : array-like of shape (n_features,) or (n_outputs, n_features)
+        Coefficient of the features in the decision function. When several
+        rows are given, only the first row is used.
+
+    intercept : float or array-like of shape (n_outputs,)
+        Independent term in decision function. When several values are given,
+        only the first one is used.
+
+    features_name : array-like of shape (n_features,)
+        Name of the features, in the same order as the coefficients.
+
+    Returns
+    -------
+    formula : dict
+        The formula of linear models, stored under the key "y".
+    """
+    # scikit-learn exposes coef_ as either 1-D or 2-D and intercept_ as either a float or a 1-D array,
+    # so both are normalised before the first output is picked.
+    first_coef = np.atleast_2d(np.around(coef, decimals=3))[0].tolist()
+    first_intercept = np.ravel(intercept)[0]
+
+    terms = []
+    for i, value in enumerate(first_coef):
+        # not append if zero
+        if value == 0:
+            continue
+        # add plus symbol only when a positive term follows another term, so the formula never starts with "+"
+        sign = "+" if value > 0 and terms else ""
+        # str() keeps non-string column labels (e.g. integers) from breaking the concatenation
+        terms.append(sign + str(value) + str(features_name[i]))
+
+    # a non-negative intercept needs an explicit plus symbol unless it is the only term
+    sign = "+" if first_intercept >= 0 and terms else ""
+    formula = "".join(terms) + sign + str(first_intercept)
+    print("y =", formula)
+
+    return {"y": formula}
+
+
+# Used by linear models including classification and regression
+# ------>
diff --git a/geochemistrypi/data_mining/model/func/algo_classification/_common.py b/geochemistrypi/data_mining/model/func/algo_classification/_common.py
@@ -141,42 +141,6 @@ def cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.D
     return scores_result
 
 
-# def contour_data(X: pd.DataFrame, trained_model: object) -> Tuple[List[np.ndarray], np.ndarray]:
-#     """Build up coordinate matrices as the data of contour plot.
-
-#     Parameters
-#     ----------
-#     X : pd.DataFrame (n_samples, n_components)
-#         The complete feature data.
-
-#     trained_model : object
-#         Te algorithm model class from sklearn is trained.
-
-#     Returns
-#     -------
-#     matrices : List[np.ndarray]
-#         Coordinate matrices.
-
-#     labels : np.ndarray
-#         Predicted value by the trained model with coordinate data as input data.
-#     """
-
-#     # build up coordinate matrices from coordinate vectors.
-#     xi = [np.arange(X.iloc[:, i].min(), X.iloc[:, i].max(), (X.iloc[:, i].max() - X.iloc[:, i].min()) / 50) for i in range(X.shape[1])]
-#     ndim = len(xi)
-#     s0 = (1,) * ndim
-#     matrices = [np.asanyarray(x).reshape(s0[:i] + (-1,) + s0[i + 1 :]) for i, x in enumerate(xi)]
-#     matrices[0].shape = (1, -1) + s0[2:]
-#     matrices[1].shape = (-1, 1) + s0[2:]
-#     matrices = np.broadcast_arrays(*matrices, subok=True)
-
-#     # get the labels of the coordinate matrices through the trained model
-#     input_array = np.column_stack((i.ravel() for i in matrices))
-#     labels = trained_model.predict(input_array).reshape(matrices[0].shape)
-
-#     return matrices, labels
-
-
 def plot_precision_recall(X_test, y_test, trained_model: object, algorithm_name: str) -> tuple:
     """Plot the precision-recall curve.
 

diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_linear_regression.py
@@ -2,7 +2,6 @@
 from typing import Dict
 
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 from mpl_toolkits.mplot3d import Axes3D
 from rich import print
@@ -30,33 +29,6 @@ def linear_regression_manual_hyper_parameters() -> Dict:
     return hyper_parameters
 
 
-def show_formula(coef, intercept, columns_name):
-    term = []
-    coef = np.around(coef, decimals=3).tolist()[0]
-
-    for i in range(len(coef)):
-        # the first value stay the same
-        if i == 0:
-            # not append if zero
-            if coef[i] != 0:
-                temp = str(coef[i]) + columns_name[i]
-                term.append(temp)
-        else:
-            # add plus symbol if positive, maintain if negative, not append if zero
-            if coef[i] > 0:
-                temp = "+" + str(coef[i]) + columns_name[i]
-                term.append(temp)
-            elif coef[i] < 0:
-                temp = str(coef[i]) + columns_name[i]
-                term.append(temp)
-    if intercept[0] >= 0:
-        # formula of linear regression
-        formula = "".join(term) + "+" + str(intercept[0])
-    else:
-        formula = "".join(term) + str(intercept[0])
-    print("y =", formula)
-
-
 def plot_2d_graph(feature_data: pd.DataFrame, target_data: pd.DataFrame = None) -> None:
     """Plot a 2D graph with the data set below.
 

diff --git a/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py b/geochemistrypi/data_mining/model/func/algo_regression/_polynomial_regression.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
-from typing import Dict, List
+from typing import Dict
 
-import numpy as np
 from rich import print
 
 from ....constants import SECTION
@@ -28,43 +27,3 @@ def polynomial_regression_manual_hyper_parameters() -> Dict:
     include_bias = bool(str_input(include_biases, SECTION[2]))
     hyper_parameters = {"degree": degree, "interaction_only": interaction_only, "include_bias": include_bias}
     return hyper_parameters
-
-
-def show_formula(coef: np.ndarray, intercept: np.ndarray, features_name: List) -> None:
-    """Show the formula of polynomial regression.
-
-    Parameters
-    ----------
-    coef : array
-        Coefficient of the features in the decision function.
-
-    intercept : array
-        Independent term in decision function.
-
-    features_name : list
-        Name of the features.
-    """
-    term = []
-    coef = np.around(coef, decimals=3).tolist()[0]
-
-    for i in range(len(coef)):
-        # the first value stay the same
-        if i == 0:
-            # not append if zero
-            if coef[i] != 0:
-                temp = str(coef[i]) + features_name[i]
-                term.append(temp)
-        else:
-            # add plus symbol if positive, maintain if negative, not append if zero
-            if coef[i] > 0:
-                temp = "+" + str(coef[i]) + features_name[i]
-                term.append(temp)
-            elif coef[i] < 0:
-                temp = str(coef[i]) + features_name[i]
-                term.append(temp)
-    if intercept[0] >= 0:
-        # formula of polynomial regression
-        formula = "".join(term) + "+" + str(intercept[0])
-    else:
-        formula = "".join(term) + str(intercept[0])
-    print("y =", formula)