Merge pull request #169 from ZJUEarthData/web
feat: add MLflow into current workflow.
SanyHe authored Jun 28, 2023
2 parents d762317 + 32690e6 commit e8201f6
Showing 39 changed files with 187 additions and 109 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -62,3 +62,4 @@ docs/build

# mlflow
mlruns
geopi_tracking
65 changes: 52 additions & 13 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-
import os
from time import sleep

import mlflow
from rich import print
from rich.console import Console
from rich.prompt import Confirm, Prompt

from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.preprocessing import feature_scaler
from .data.statistic import monte_carlo_simulator
from .global_variable import (
from .constants import (
CLASSIFICATION_MODELS,
CLUSTERING_MODELS,
DATASET_OUTPUT_PATH,
@@ -27,14 +26,21 @@
SECTION,
STATISTIC_IMAGE_PATH,
TEST_DATA_OPTION,
WORKING_PATH,
)
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.preprocessing import feature_scaler
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import map_projected
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, logged_distribution_plot, probability_plot, ratio_null_vs_filled
from .process.classify import ClassificationModelSelection
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
from .process.regress import RegressionModelSelection
from .utils.base import clear_output, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id

# Create the directories if they don't exist yet
os.makedirs(MODEL_OUTPUT_IMAGE_PATH, exist_ok=True)
@@ -46,16 +52,47 @@


def cli_pipeline(file_name: str) -> None:
"""The command line interface for GeochemistryPy."""

print("Geochemistry Py v0.2.1 - Beta Version")
print("....... Initializing .......")
logger = log(OUTPUT_PATH, "inner_test.log")
logger.info("Geochemistry Py v.1.0.0 - beta version")
"""The command line interface for Geochemistry π."""

# If the argument is False, hide all Python-level warnings. Developers can turn them back on by setting the argument to True.
show_warning(False)

logger = log(OUTPUT_PATH, "inner_test.log")
logger.info("Geochemistry Pi is running.")

# Display the interactive splash screen when launching the CLI software
console = Console()
console.print("\n[bold blue]Welcome to Geochemistry Pi![/bold blue]")
console.print("[bold]Initializing...[/bold]")
with console.status("[bold green]Loading...[/bold green]", spinner="dots"):
sleep(2)
console.print("✨ Input Template [bold magenta][Option1/Option2][/bold magenta] [bold cyan](Default Value)[/bold cyan]: Input Value")
# Create a new experiment or use the previous experiment
is_used_previous_experiment = Confirm.ask("✨ Use Previous Experiment", default=False)
# Set the tracking URI to a local directory; in the future, it could point to a remote server.
artifact_location = f"file:{WORKING_PATH}/geopi_tracking"
mlflow.set_tracking_uri(artifact_location)
# print("tracking uri:", mlflow.get_tracking_uri())
if is_used_previous_experiment:
old_experiment_id = None
# If the user doesn't provide a valid experiment name, ask for input again.
while not old_experiment_id:
old_experiment_name = Prompt.ask("✨ Previous Experiment Name")
old_experiment_id = retrieve_previous_experiment_id(old_experiment_name)
mlflow.set_experiment(experiment_id=old_experiment_id)
experiment = mlflow.get_experiment(experiment_id=old_experiment_id)
else:
new_experiment_name = Prompt.ask("✨ New Experiment", default="GeoPi - Rock Classification")
new_experiment_tag = Prompt.ask("✨ Experiment Tag Version", default="E - v1.0.0")
new_experiment_id = mlflow.create_experiment(name=new_experiment_name, artifact_location=artifact_location, tags={"version": new_experiment_tag})
experiment = mlflow.get_experiment(experiment_id=new_experiment_id)
# print("Artifact Location: {}".format(experiment.artifact_location))

run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm")
run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})

# Data Loading
logger.debug("User Data Uploaded")
print("-*-*- User Data Loading -*-*-")
@@ -236,7 +273,7 @@ def cli_pipeline(file_name: str) -> None:
clear_output()
break
else:
save_data(data_processed_imputed, "Data Before Splitting", DATASET_OUTPUT_PATH)
save_data(data_processed_imputed, "Data-Before-Splitting", DATASET_OUTPUT_PATH)
clear_output()
break

@@ -366,3 +403,5 @@ def cli_pipeline(file_name: str) -> None:
run = Modes2Initiators[mode_num](MODELS[i])
run.activate(X, y, X_train, X_test, y_train, y_test)
clear_output()

mlflow.end_run()
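
The hunks above rely on retrieve_previous_experiment_id from geochemistrypi/data_mining/utils/mlflow_utils.py (per the import), a file whose diff is not shown on this page. A minimal sketch of what such a helper might look like — an assumption, not necessarily the committed implementation — resolving the experiment by name through MLflow's public API and returning None on a miss so that the while-loop above re-prompts:

# Hypothetical sketch of utils/mlflow_utils.py -- not necessarily the committed implementation.
from typing import Optional

import mlflow


def retrieve_previous_experiment_id(experiment_name: str) -> Optional[str]:
    """Return the ID of the named experiment, or None if no such experiment exists."""
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        # A None return keeps the caller's while-loop asking for a valid name.
        print(f"Experiment '{experiment_name}' does not exist. Please enter the name again.")
        return None
    return experiment.experiment_id
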
67 changes: 67 additions & 0 deletions geochemistrypi/data_mining/constants.py
@@ -1 +1,68 @@
import os

# The number of datasets uploaded per user is limited to 5.
MAX_UPLOADS_PER_USER = 5

# current working directory in which the user activates the application
WORKING_PATH = os.getcwd()

# the directory in which the package (application) is installed
PACKAGEDIR = os.path.dirname(os.path.realpath(__file__))

# the directory where the built-in data set to be processed stays
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "output")

# the directory where the data set produced stays
DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")

# the directories where images are saved
MODEL_OUTPUT_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "model_output")
STATISTIC_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "statistic")
MAP_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "map")
GEO_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "geochemistry")

# the directory where trained models are saved
MODEL_PATH = os.path.join(OUTPUT_PATH, "trained_models")


# Tell which section the user is currently in on the UML diagram
SECTION = ["User", "Data", "Model", "Plot"]

OPTION = ["Yes", "No"]
DATA_OPTION = ["Own Data", "Testing Data (Built-in)"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction"]

# The models provided for use
REGRESSION_MODELS = [
"Linear Regression",
"Polynomial Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"Deep Neural Network",
]
CLASSIFICATION_MODELS = [
"Logistic Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"Deep Neural Network",
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["Principal Component Analysis", "T-SNE"]

# Models that need special treatment in the AutoML workflow
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
RAY_FLAML = ["Deep Neural Network"]

IMPUTING_STRATEGY = ["Mean Value", "Median Value", "Most Frequent Value", "Constant(Specified Value)"]

FEATURE_SCALING_STRATEGY = ["Min-max Scaling", "Standardization"]
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/data/data_readiness.py
@@ -8,7 +8,7 @@
from rich import print
from sklearn.model_selection import train_test_split

from ..global_variable import BUILT_IN_DATASET_PATH
from ..constants import BUILT_IN_DATASET_PATH

# from utils.exceptions import InvalidFileError

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/data/imputation.py
@@ -4,7 +4,7 @@
from rich import print
from sklearn.impute import SimpleImputer

from ..global_variable import SECTION
from ..constants import SECTION
from .data_readiness import float_input


65 changes: 0 additions & 65 deletions geochemistrypi/data_mining/global_variable.py

This file was deleted.

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/_base.py
@@ -11,8 +11,8 @@
from multipledispatch import dispatch
from rich import print

from ..constants import MODEL_PATH, SECTION
from ..data.data_readiness import limit_num_input, num2option, num_input, show_data_columns
from ..global_variable import MODEL_PATH, SECTION
from ..utils.base import save_data


2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/classification.py
@@ -14,7 +14,7 @@
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from ..global_variable import MODEL_OUTPUT_IMAGE_PATH, RAY_FLAML
from ..constants import MODEL_OUTPUT_IMAGE_PATH, RAY_FLAML
from ..utils.base import save_fig
from ._base import WorkflowBase
from .func.algo_classification._common import confusion_matrix_plot, cross_validation, plot_precision_recall, plot_ROC
10 changes: 9 additions & 1 deletion geochemistrypi/data_mining/model/clustering.py
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
from typing import Dict, Optional, Union

import mlflow
import numpy as np
import pandas as pd
from rich import print
from sklearn import metrics
from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans

from ..global_variable import DATASET_OUTPUT_PATH, MODEL_OUTPUT_IMAGE_PATH
from ..constants import DATASET_OUTPUT_PATH, MODEL_OUTPUT_IMAGE_PATH
from ..utils.base import save_data, save_fig
from ._base import WorkflowBase
from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
@@ -27,6 +28,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
"""Fit the model according to the given training data."""
self.X = X
self.model.fit(X)
mlflow.log_params(self.model.get_params())

@classmethod
def manual_hyper_parameters(cls) -> Dict:
@@ -177,6 +179,9 @@ def _get_scores(self):
print("Inertia Score: ", self.model.inertia_)
print("Calinski Harabasz Score: ", metrics.calinski_harabasz_score(self.X, self.model.labels_))
print("Silhouette Score: ", metrics.silhouette_score(self.X, self.model.labels_))
mlflow.log_metric("Inertia Score", self.model.inertia_)
mlflow.log_metric("Calinski Harabasz Score", metrics.calinski_harabasz_score(self.X, self.model.labels_))
mlflow.log_metric("Silhouette Score", metrics.silhouette_score(self.X, self.model.labels_))

@staticmethod
def _plot_silhouette_diagram(
@@ -191,20 +196,23 @@ def _plot_silhouette_diagram(
print("-----* Silhouette Diagram *-----")
plot_silhouette_diagram(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name)
save_fig(f"Silhouette Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Silhouette Diagram - {algorithm_name}.png")

@staticmethod
def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None:
"""Plot the two-dimensional diagram of the clustering result."""
print("-----* Cluster Two-Dimensional Diagram *-----")
scatter2d(data, cluster_labels, algorithm_name)
save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Cluster Two-Dimensional Diagram - {algorithm_name}.png")

@staticmethod
def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None:
"""Plot the three-dimensional diagram of the clustering result."""
print("-----* Cluster Three-Dimensional Diagram *-----")
scatter3d(data, cluster_labels, algorithm_name)
save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Cluster Three-Dimensional Diagram - {algorithm_name}.png")

def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
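
Taken together, the clustering changes log three kinds of records to the active MLflow run: hyperparameters via mlflow.log_params, evaluation scores via mlflow.log_metric, and saved figures via mlflow.log_artifact. A self-contained sketch of the same pattern outside the pipeline (the KMeans demo data and run name are illustrative only, not part of this commit):

# Standalone illustration of the logging pattern used in clustering.py above.
import mlflow
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
model = KMeans(n_clusters=3, n_init=10, random_state=42)

with mlflow.start_run(run_name="kmeans-demo"):
    model.fit(X)
    mlflow.log_params(model.get_params())  # hyperparameters, as in fit() above
    mlflow.log_metric("Inertia Score", model.inertia_)  # scores, as in _get_scores()
    mlflow.log_metric("Silhouette Score", metrics.silhouette_score(X, model.labels_))
    # mlflow.log_artifact("path/to/figure.png") would attach a saved plot, as in the plot helpers
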
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/decomposition.py
@@ -7,7 +7,7 @@
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from ..global_variable import MODEL_OUTPUT_IMAGE_PATH
from ..constants import MODEL_OUTPUT_IMAGE_PATH
from ..utils.base import save_fig
from ._base import WorkflowBase
from .func.algo_decomposition._pca import biplot, pca_manual_hyper_parameters, triplot
@@ -4,8 +4,8 @@
from rich import print
from sklearn.tree import plot_tree

from ....constants import SECTION
from ....data.data_readiness import num_input, str_input
from ....global_variable import SECTION


def decision_tree_manual_hyper_parameters() -> Dict:
@@ -3,8 +3,8 @@

from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input, tuple_input
from ....global_variable import SECTION


def deep_neural_network_manual_hyper_parameters() -> Dict:
@@ -2,8 +2,8 @@

from rich import print

from ....constants import SECTION
from ....data.data_readiness import num_input, str_input
from ....global_variable import SECTION


def extra_trees_manual_hyper_parameters() -> Dict:
@@ -5,8 +5,8 @@
import pandas as pd
from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input
from ....global_variable import SECTION


def logistic_regression_manual_hyper_parameters() -> Dict:
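
Once mlflow.end_run() completes, everything logged in a session is persisted in the file-based store under geopi_tracking. A hypothetical snippet (not part of this commit) for inspecting the results afterwards, using the default experiment name suggested by the CLI prompt:

# Hypothetical inspection snippet -- not part of this commit.
import mlflow

mlflow.set_tracking_uri("file:./geopi_tracking")  # the same store cli_pipeline writes to
runs = mlflow.search_runs(experiment_names=["GeoPi - Rock Classification"])
print(runs[["run_id", "status", "tags.version"]])

The same records can be browsed interactively with MLflow's bundled UI, e.g. mlflow ui --backend-store-uri file:./geopi_tracking.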