Merge pull request #169 from ZJUEarthData/web
feat: add MLflow into current workflow.
SanyHe authored Jun 28, 2023
2 parents d762317 + 32690e6 commit e8201f6
Showing 39 changed files with 187 additions and 109 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -62,3 +62,4 @@ docs/build

# mlflow
mlruns
geopi_tracking
65 changes: 52 additions & 13 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-
import os
from time import sleep

import mlflow
from rich import print
from rich.console import Console
from rich.prompt import Confirm, Prompt

from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.preprocessing import feature_scaler
from .data.statistic import monte_carlo_simulator
from .global_variable import (
from .constants import (
CLASSIFICATION_MODELS,
CLUSTERING_MODELS,
DATASET_OUTPUT_PATH,
@@ -27,14 +26,21 @@
SECTION,
STATISTIC_IMAGE_PATH,
TEST_DATA_OPTION,
WORKING_PATH,
)
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.preprocessing import feature_scaler
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import map_projected
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, logged_distribution_plot, probability_plot, ratio_null_vs_filled
from .process.classify import ClassificationModelSelection
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
from .process.regress import RegressionModelSelection
from .utils.base import clear_output, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id

# Create the directories if they don't exist yet
os.makedirs(MODEL_OUTPUT_IMAGE_PATH, exist_ok=True)
@@ -46,16 +52,47 @@


def cli_pipeline(file_name: str) -> None:
"""The command line interface for GeochemistryPy."""

print("Geochemistry Py v0.2.1 - Beta Version")
print("....... Initializing .......")
logger = log(OUTPUT_PATH, "inner_test.log")
logger.info("Geochemistry Py v.1.0.0 - beta version")
"""The command line interface for Geochemistry π."""

# If the argument is False, hide all Python-level warnings. Developers can turn them back on by setting the argument to True.
show_warning(False)

logger = log(OUTPUT_PATH, "inner_test.log")
logger.info("Geochemistry Pi is running.")

# Display the interactive splash screen when launching the CLI software
console = Console()
console.print("\n[bold blue]Welcome to Geochemistry Pi![/bold blue]")
console.print("[bold]Initializing...[/bold]")
with console.status("[bold green]Loading...[/bold green]", spinner="dots"):
sleep(2)
console.print("✨ Input Template [bold magenta][Option1/Option2][/bold magenta] [bold cyan](Default Value)[/bold cyan]: Input Value")
# Create a new experiment or use the previous experiment
is_used_previous_experiment = Confirm.ask("✨ Use Previous Experiment", default=False)
# Set the tracking URI to a local directory; in the future, it could point to a remote server.
artifact_location = f"file:{WORKING_PATH}/geopi_tracking"
mlflow.set_tracking_uri(artifact_location)
# print("tracking uri:", mlflow.get_tracking_uri())
if is_used_previous_experiment:
old_experiment_id = None
# If the user doesn't provide a valid experiment name, ask for input again.
while not old_experiment_id:
old_experiment_name = Prompt.ask("✨ Previous Experiment Name")
old_experiment_id = retrieve_previous_experiment_id(old_experiment_name)
mlflow.set_experiment(experiment_id=old_experiment_id)
experiment = mlflow.get_experiment(experiment_id=old_experiment_id)
else:
new_experiment_name = Prompt.ask("✨ New Experiment", default="GeoPi - Rock Classification")
new_experiment_tag = Prompt.ask("✨ Experiment Tag Version", default="E - v1.0.0")
new_experiment_id = mlflow.create_experiment(name=new_experiment_name, artifact_location=artifact_location, tags={"version": new_experiment_tag})
experiment = mlflow.get_experiment(experiment_id=new_experiment_id)
# print("Artifact Location: {}".format(experiment.artifact_location))

run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm")
run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})

# Data Loading
logger.debug("User Data Uploaded")
print("-*-*- User Data Loading -*-*-")
@@ -236,7 +273,7 @@ def cli_pipeline(file_name: str) -> None:
clear_output()
break
else:
save_data(data_processed_imputed, "Data Before Splitting", DATASET_OUTPUT_PATH)
save_data(data_processed_imputed, "Data-Before-Splitting", DATASET_OUTPUT_PATH)
clear_output()
break

@@ -366,3 +403,5 @@ def cli_pipeline(file_name: str) -> None:
run = Modes2Initiators[mode_num](MODELS[i])
run.activate(X, y, X_train, X_test, y_train, y_test)
clear_output()

mlflow.end_run()
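
The hunks above rely on retrieve_previous_experiment_id from geochemistrypi/data_mining/utils/mlflow_utils.py (per the import), a file whose diff is not shown on this page. A minimal sketch of what such a helper might look like — an assumption, not necessarily the committed implementation — resolving the experiment by name through MLflow's public API and returning None on a miss so that the while-loop above re-prompts:

# Hypothetical sketch of utils/mlflow_utils.py -- not necessarily the committed implementation.
from typing import Optional

import mlflow


def retrieve_previous_experiment_id(experiment_name: str) -> Optional[str]:
    """Return the ID of the named experiment, or None if no such experiment exists."""
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        # A None return keeps the caller's while-loop asking for a valid name.
        print(f"Experiment '{experiment_name}' does not exist. Please enter the name again.")
        return None
    return experiment.experiment_id
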
67 changes: 67 additions & 0 deletions geochemistrypi/data_mining/constants.py
@@ -1 +1,68 @@
import os

# The number of datasets uploaded per user is limited to 5.
MAX_UPLOADS_PER_USER = 5

# current working directory in which the user activates the application
WORKING_PATH = os.getcwd()

# the directory in which the package (application) is installed
PACKAGEDIR = os.path.dirname(os.path.realpath(__file__))

# the directory where the built-in data set to be processed stays
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "output")

# the directory where the data set produced stays
DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")

# the directories where images are saved
MODEL_OUTPUT_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "model_output")
STATISTIC_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "statistic")
MAP_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "map")
GEO_IMAGE_PATH = os.path.join(OUTPUT_PATH, "images", "geochemistry")

# the directory where trained models are saved
MODEL_PATH = os.path.join(OUTPUT_PATH, "trained_models")


# Tell which section the user is currently in on the UML diagram
SECTION = ["User", "Data", "Model", "Plot"]

OPTION = ["Yes", "No"]
DATA_OPTION = ["Own Data", "Testing Data (Built-in)"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction"]

# The models provided for use
REGRESSION_MODELS = [
"Linear Regression",
"Polynomial Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"Deep Neural Network",
]
CLASSIFICATION_MODELS = [
"Logistic Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"Deep Neural Network",
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["Principal Component Analysis", "T-SNE"]

# Models that need special treatment in the AutoML workflow
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
RAY_FLAML = ["Deep Neural Network"]

IMPUTING_STRATEGY = ["Mean Value", "Median Value", "Most Frequent Value", "Constant(Specified Value)"]

FEATURE_SCALING_STRATEGY = ["Min-max Scaling", "Standardization"]
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/data/data_readiness.py
@@ -8,7 +8,7 @@
from rich import print
from sklearn.model_selection import train_test_split

from ..global_variable import BUILT_IN_DATASET_PATH
from ..constants import BUILT_IN_DATASET_PATH

# from utils.exceptions import InvalidFileError

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/data/imputation.py
@@ -4,7 +4,7 @@
from rich import print
from sklearn.impute import SimpleImputer

from ..global_variable import SECTION
from ..constants import SECTION
from .data_readiness import float_input


65 changes: 0 additions & 65 deletions geochemistrypi/data_mining/global_variable.py

This file was deleted.

2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/_base.py
@@ -11,8 +11,8 @@
from multipledispatch import dispatch
from rich import print

from ..constants import MODEL_PATH, SECTION
from ..data.data_readiness import limit_num_input, num2option, num_input, show_data_columns
from ..global_variable import MODEL_PATH, SECTION
from ..utils.base import save_data


2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/classification.py
@@ -14,7 +14,7 @@
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from ..global_variable import MODEL_OUTPUT_IMAGE_PATH, RAY_FLAML
from ..constants import MODEL_OUTPUT_IMAGE_PATH, RAY_FLAML
from ..utils.base import save_fig
from ._base import WorkflowBase
from .func.algo_classification._common import confusion_matrix_plot, cross_validation, plot_precision_recall, plot_ROC
10 changes: 9 additions & 1 deletion geochemistrypi/data_mining/model/clustering.py
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
from typing import Dict, Optional, Union

import mlflow
import numpy as np
import pandas as pd
from rich import print
from sklearn import metrics
from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans

from ..global_variable import DATASET_OUTPUT_PATH, MODEL_OUTPUT_IMAGE_PATH
from ..constants import DATASET_OUTPUT_PATH, MODEL_OUTPUT_IMAGE_PATH
from ..utils.base import save_data, save_fig
from ._base import WorkflowBase
from .func.algo_clustering._dbscan import dbscan_manual_hyper_parameters, dbscan_result_plot
@@ -27,6 +28,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> None:
"""Fit the model according to the given training data."""
self.X = X
self.model.fit(X)
mlflow.log_params(self.model.get_params())

@classmethod
def manual_hyper_parameters(cls) -> Dict:
@@ -177,6 +179,9 @@ def _get_scores(self):
print("Inertia Score: ", self.model.inertia_)
print("Calinski Harabasz Score: ", metrics.calinski_harabasz_score(self.X, self.model.labels_))
print("Silhouette Score: ", metrics.silhouette_score(self.X, self.model.labels_))
mlflow.log_metric("Inertia Score", self.model.inertia_)
mlflow.log_metric("Calinski Harabasz Score", metrics.calinski_harabasz_score(self.X, self.model.labels_))
mlflow.log_metric("Silhouette Score", metrics.silhouette_score(self.X, self.model.labels_))

@staticmethod
def _plot_silhouette_diagram(
@@ -191,20 +196,23 @@ def _plot_silhouette_diagram(
print("-----* Silhouette Diagram *-----")
plot_silhouette_diagram(data, cluster_labels, cluster_centers_, n_clusters, algorithm_name)
save_fig(f"Silhouette Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Silhouette Diagram - {algorithm_name}.png")

@staticmethod
def _scatter2d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None:
"""Plot the two-dimensional diagram of the clustering result."""
print("-----* Cluster Two-Dimensional Diagram *-----")
scatter2d(data, cluster_labels, algorithm_name)
save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Cluster Two-Dimensional Diagram - {algorithm_name}.png")

@staticmethod
def _scatter3d(data: pd.DataFrame, cluster_labels: pd.DataFrame, algorithm_name: str, store_path: str) -> None:
"""Plot the three-dimensional diagram of the clustering result."""
print("-----* Cluster Three-Dimensional Diagram *-----")
scatter3d(data, cluster_labels, algorithm_name)
save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", store_path)
mlflow.log_artifact(f"{store_path}/Cluster Three-Dimensional Diagram - {algorithm_name}.png")

def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
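
Taken together, the clustering changes log three kinds of records to the active MLflow run: hyperparameters via mlflow.log_params, evaluation scores via mlflow.log_metric, and saved figures via mlflow.log_artifact. A self-contained sketch of the same pattern outside the pipeline (the KMeans demo data and run name are illustrative only, not part of this commit):

# Standalone illustration of the logging pattern used in clustering.py above.
import mlflow
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
model = KMeans(n_clusters=3, n_init=10, random_state=42)

with mlflow.start_run(run_name="kmeans-demo"):
    model.fit(X)
    mlflow.log_params(model.get_params())  # hyperparameters, as in fit() above
    mlflow.log_metric("Inertia Score", model.inertia_)  # scores, as in _get_scores()
    mlflow.log_metric("Silhouette Score", metrics.silhouette_score(X, model.labels_))
    # mlflow.log_artifact("path/to/figure.png") would attach a saved plot, as in the plot helpers
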
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/model/decomposition.py
@@ -7,7 +7,7 @@
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from ..global_variable import MODEL_OUTPUT_IMAGE_PATH
from ..constants import MODEL_OUTPUT_IMAGE_PATH
from ..utils.base import save_fig
from ._base import WorkflowBase
from .func.algo_decomposition._pca import biplot, pca_manual_hyper_parameters, triplot
@@ -4,8 +4,8 @@
from rich import print
from sklearn.tree import plot_tree

from ....constants import SECTION
from ....data.data_readiness import num_input, str_input
from ....global_variable import SECTION


def decision_tree_manual_hyper_parameters() -> Dict:
@@ -3,8 +3,8 @@

from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input, tuple_input
from ....global_variable import SECTION


def deep_neural_network_manual_hyper_parameters() -> Dict:
@@ -2,8 +2,8 @@

from rich import print

from ....constants import SECTION
from ....data.data_readiness import num_input, str_input
from ....global_variable import SECTION


def extra_trees_manual_hyper_parameters() -> Dict:
@@ -5,8 +5,8 @@
import pandas as pd
from rich import print

from ....constants import SECTION
from ....data.data_readiness import float_input, num_input, str_input
from ....global_variable import SECTION


def logistic_regression_manual_hyper_parameters() -> Dict:
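
Once mlflow.end_run() completes, everything logged in a session is persisted in the file-based store under geopi_tracking. A hypothetical snippet (not part of this commit) for inspecting the results afterwards, using the default experiment name suggested by the CLI prompt:

# Hypothetical inspection snippet -- not part of this commit.
import mlflow

mlflow.set_tracking_uri("file:./geopi_tracking")  # the same store cli_pipeline writes to
runs = mlflow.search_runs(experiment_names=["GeoPi - Rock Classification"])
print(runs[["run_id", "status", "tags.version"]])

The same records can be browsed interactively with MLflow's bundled UI, e.g. mlflow ui --backend-store-uri file:./geopi_tracking.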