From f4d903b5f40405c7fd6a5d27f228d7043b414b78 Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 15 Sep 2024 20:49:35 +0800 Subject: [PATCH 1/2] perf:Add a name to the data --- geochemistrypi/data_mining/cli_pipeline.py | 162 +++++++++++++----- .../data_mining/data/data_readiness.py | 70 +++++++- .../data_mining/data/feature_engineering.py | 7 +- geochemistrypi/data_mining/data/inference.py | 7 +- geochemistrypi/data_mining/model/_base.py | 40 +++-- .../data_mining/model/classification.py | 86 +++++++--- .../data_mining/model/clustering.py | 35 ++-- .../data_mining/model/decomposition.py | 29 ++-- geochemistrypi/data_mining/model/detection.py | 42 +++-- .../data_mining/model/regression.py | 87 +++++++++- geochemistrypi/data_mining/plot/map_plot.py | 14 +- .../data_mining/plot/statistic_plot.py | 12 +- .../data_mining/process/classify.py | 38 ++-- geochemistrypi/data_mining/process/cluster.py | 5 +- .../data_mining/process/decompose.py | 7 +- geochemistrypi/data_mining/process/detect.py | 17 +- geochemistrypi/data_mining/process/regress.py | 31 +++- geochemistrypi/data_mining/utils/base.py | 47 ++++- 18 files changed, 554 insertions(+), 182 deletions(-) diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 47796b30..ba66e9e4 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -4,6 +4,7 @@ from typing import Optional import mlflow +import pandas as pd from rich import print from rich.console import Console from rich.prompt import Confirm, Prompt @@ -33,7 +34,20 @@ TOGGLE_ADDRESS_STATUS, WORKING_PATH, ) -from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns, show_excel_columns +from .data.data_readiness import ( + basic_info, + create_sub_data_set, + data_split, + float_input, + limit_num_input, + np2pd, + num2option, + num_input, + read_data, + 
select_column_name, + show_data_columns, + show_excel_columns, +) from .data.feature_engineering import FeatureConstructor from .data.imputation import imputer from .data.inference import build_transform_pipeline, model_inference @@ -240,10 +254,19 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = elif is_built_in_inference_data and built_in_training_data_num == 5: inference_data = None + # <--- Name Selection ---> + logger.debug("Output Data Identifier Column Selection") + print("-*-*- Output Data Identifier Column Selection -*-*-") + show_data_columns(data.columns) + NAME = select_column_name(data) + clear_output() + name_column_origin = [] + name_column_select = data[NAME] + # <--- World Map Projection ---> logger.debug("World Map Projection") print("-*-*- World Map Projection -*-*-") - process_world_map(data) + process_world_map(data, name_column_select) # <--- Data Selection ---> logger.debug("Data Selection") @@ -257,12 +280,13 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print("-*-*- Basic Statistical Information -*-*-") basic_info(data_selected) basic_statistic(data_selected) - correlation_plot(data_selected.columns, data_selected) - distribution_plot(data_selected.columns, data_selected) - log_distribution_plot(data_selected.columns, data_selected) + correlation_plot(data_selected.columns, data_selected, name_column_select) + distribution_plot(data_selected.columns, data_selected, name_column_select) + log_distribution_plot(data_selected.columns, data_selected, name_column_select) GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH") - save_data(data, "Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(data_selected, "Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(data, name_column_origin, "Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + 
save_data(data_selected, name_column_select, "Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + data_selected_name = pd.concat([name_column_select, data_selected], axis=1) clear_output() # <--- Missing Value Process ---> @@ -316,11 +340,16 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = data_selected_dropped = data_selected.dropna() # Reset the index of the data set after dropping the rows with missing values. data_selected_dropped = data_selected_dropped.reset_index(drop=True) + # Drop the rows with missing values + data_selected_dropped_name = data_selected_name.dropna() + # Reset the index of the data set after dropping the rows with missing values. + data_selected_dropped_name = data_selected_dropped_name.reset_index(drop=True) print("Successfully drop the rows with missing values.") print("The Selected Data Set After Dropping:") print(data_selected_dropped) print("Basic Statistical Information:") - save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + drop_name_column = data_selected_dropped_name.iloc[:, 0] + save_data(data_selected_dropped, drop_name_column, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) drop_rows_with_missing_value_flag = True imputed_flag = False elif drop_missing_value_strategy_num == 2: @@ -329,16 +358,22 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print("Note: The data set schema will remain the same after dropping the rows with missing values by specific columns.") drop_data_selected = create_sub_data_set(data_selected) data_selected_dropped = data_selected + data_selected_dropped_name = data_selected_name for column_name in drop_data_selected.columns: # Drop the rows with missing values data_selected_dropped = data_selected_dropped.dropna(subset=[column_name]) # Reset the index of the data set after dropping the 
rows with missing values. data_selected_dropped = data_selected_dropped.reset_index(drop=True) + # Drop the rows with missing values + data_selected_dropped_name = data_selected_dropped_name.dropna(subset=[column_name]) + # Reset the index of the data set after dropping the rows with missing values. + data_selected_dropped_name = data_selected_dropped_name.reset_index(drop=True) print("Successfully drop the rows with missing values.") print("The Selected Data Set After Dropping:") print(data_selected_dropped) print("Basic Statistical Information:") - save_data(data_selected_dropped, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + drop_name_column = data_selected_dropped_name.iloc[:, 0] + save_data(data_selected_dropped, drop_name_column, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) drop_rows_with_missing_value_flag = True imputed_flag = False missing_value_flag = check_missing_value(data_selected_dropped) @@ -352,14 +387,15 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = else: # Don't deal with the missing values, which means neither drop the rows with missing values nor use imputation techniques. imputed_flag = False - save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(data_selected, name_column_select, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() else: # If the selected data set doesn't have missing values, then don't deal with the missing values. 
imputed_flag = False - save_data(data_selected, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(data_selected, name_column_select, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() data_selected = data_selected_dropped if drop_rows_with_missing_value_flag else data_selected + drop_name_column = data_selected_dropped_name.iloc[:, 0] if drop_rows_with_missing_value_flag else name_column_select # If the selected data set contains missing values and the user wants to deal with the missing values and choose not to drop the rows with missing values, # then use imputation techniques to deal with the missing values. if imputed_flag: @@ -386,7 +422,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = probability_plot(data_selected.columns, data_selected, data_selected_imputed) basic_info(data_selected_imputed) basic_statistic(data_selected_imputed) - save_data(data_selected_imputed, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(data_selected_imputed, drop_name_column, "Data Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) del data_selected clear_output() else: @@ -397,7 +433,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Feature Engineering ---> logger.debug("Feature Engineering") print("-*-*- Feature Engineering -*-*-") - feature_builder = FeatureConstructor(data_selected_imputed) + feature_builder = FeatureConstructor(data_selected_imputed, drop_name_column) data_selected_imputed_fe = feature_builder.build() # feature_engineering_config is possible to be {} feature_engineering_config = feature_builder.config @@ -428,6 +464,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = # <--- Data Segmentation ---> # divide X and y data set when it is 
supervised learning logger.debug("Data Divsion") + name_all = drop_name_column if mode_num == 1 or mode_num == 2: # Supervised learning print("-*-*- Data Segmentation - X Set and Y Set -*-*-") @@ -442,7 +479,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print(X) print("Basic Statistical Information: ") basic_statistic(X) - save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(X, name_all, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() # Create Y data set @@ -458,7 +495,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print(y) print("Basic Statistical Information: ") basic_statistic(y) - save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(y, name_all, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() # <--- Feature Scaling ---> @@ -476,7 +513,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print(X) print("Basic Statistical Information: ") basic_statistic(X) - save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(X, name_all, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: feature_scaling_config = {} clear_output() @@ -492,7 +529,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = feature_selection_config, X = feature_selector(X, y, mode_num, FEATURE_SELECTION_STRATEGY, feature_selection_num - 1) print("--Selected Features-") show_data_columns(X.columns) - save_data(X, "X After Feature Selection", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(X, name_all, "X After Feature Selection", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: feature_selection_config = {} clear_output() @@ -501,16 +538,23 @@ def 
cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print("-*-*- Data Split - Train Set and Test Set -*-*-") print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.") test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ") - train_test_data = data_split(X, y, test_ratio) + train_test_data = data_split(X, y, drop_name_column, test_ratio) for key, value in train_test_data.items(): + if key in ["Name Train", "Name Test"]: + continue print("-" * 25) print(f"The Selected Data Set: {key}") print(value) print(f"Basic Statistical Information: {key}") basic_statistic(value) - save_data(value, key, GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + if key == "X Train" or key == "Y Train": + data_name_column = train_test_data["Name Train"] + else: + data_name_column = train_test_data["Name Test"] + save_data(value, data_name_column, key, GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) X_train, X_test = train_test_data["X Train"], train_test_data["X Test"] y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"] + name_train, name_test = train_test_data["Name Train"], train_test_data["Name Test"] del data_selected_imputed_fe clear_output() else: @@ -532,7 +576,7 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print(X) print("Basic Statistical Information: ") basic_statistic(X) - save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(X, name_all, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: feature_scaling_config = {} clear_output() @@ -540,8 +584,8 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = feature_selection_config = {} # Create training data without data split because it is unsupervised learning X_train = X - y, X_test, y_train, y_test = None, None, None, None - + y, X_test, y_train, y_test, 
name_train, name_test = None, None, None, None, None, None + name_all = drop_name_column # <--- Model Selection ---> logger.debug("Model Selection") print("-*-*- Model Selection -*-*-") @@ -605,15 +649,17 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = new_feature_builder = FeatureConstructor(inference_data) inference_data_fe = new_feature_builder.batch_build(feature_engineering_config) inference_data_fe_selected = inference_data_fe[selected_columns] - save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe, "Application Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe_selected, "Application Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + inference_name_column = inference_data[NAME] + save_data(inference_data, name_column_origin, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe, inference_name_column, "Application Data Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected, inference_name_column, "Application Data Feature-Engineering Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: print("You have not applied feature engineering to the training data.") print("Hence, no feature engineering operation will be applied to the inference data.") inference_data_fe_selected = inference_data[selected_columns] - save_data(inference_data, "Application Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) - save_data(inference_data_fe_selected, "Application Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + inference_name_column = inference_data[NAME] + save_data(inference_data, name_column_origin, "Application Data 
Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_fe_selected, inference_name_column, "Application Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) else: # If the user doesn't provide the inference data path, it means that the user doesn't want to run the model inference. print("You did not enter inference data.") @@ -635,9 +681,9 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = run = Modes2Initiators[mode_num](model_name) # If is_automl is False, then run the model without AutoML. if not is_automl: - run.activate(X, y, X_train, X_test, y_train, y_test) + run.activate(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all) else: - run.activate(X, y, X_train, X_test, y_train, y_test, is_automl) + run.activate(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all, is_automl) clear_output() # <--- Transform Pipeline ---> @@ -655,11 +701,28 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print("-*-*- Model Inference -*-*-") if drop_rows_with_missing_value_flag: inference_data_fe_selected_dropped = inference_data_fe_selected.dropna() - model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline) - save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + inference_name_column = inference_data[NAME] + model_inference(inference_data_fe_selected_dropped, inference_name_column, is_inference, run, transformer_config, transform_pipeline) + save_data( + inference_data_fe_selected_dropped, + inference_name_column, + "Application Data Feature-Engineering Selected Dropped-Imputed", + GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, + MLFLOW_ARTIFACT_DATA_PATH, + ) else: - model_inference(inference_data_fe_selected, is_inference, run, 
transformer_config, transform_pipeline) + inference_name_column = inference_data[NAME] + model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline) clear_output() + + # <--- Data Dumping ---> + # In this section, convert the data in the output to the summary. + GEOPI_OUTPUT_SUMMARY_PATH = os.getenv("GEOPI_OUTPUT_SUMMARY_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") + GEOPI_OUTPUT_PARAMETERS_PATH = os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH") + copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH, GEOPI_OUTPUT_METRICS_PATH, GEOPI_OUTPUT_PARAMETERS_PATH, GEOPI_OUTPUT_SUMMARY_PATH) + else: # Run all models for i in range(len(MODELS) - 1): @@ -669,14 +732,14 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = run = Modes2Initiators[mode_num](MODELS[i]) # If is_automl is False, then run all models without AutoML. if not is_automl: - run.activate(X, y, X_train, X_test, y_train, y_test) + run.activate(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all) else: # If is_automl is True, but MODELS[i] is in the NON_AUTOML_MODELS, then run the model without AutoML. if MODELS[i] in NON_AUTOML_MODELS: - run.activate(X, y, X_train, X_test, y_train, y_test) + run.activate(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all) else: # If is_automl is True, and MODELS[i] is not in the NON_AUTOML_MODELS, then run the model with AutoML. - run.activate(X, y, X_train, X_test, y_train, y_test, is_automl) + run.activate(X, y, X_train, X_test, y_train, y_test, name_train, name_test, name_all, is_automl) # <--- Transform Pipeline ---> # Construct the transform pipeline using sklearn.pipeline.make_pipeline method. 
@@ -692,17 +755,26 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = print("-*-*- Model Inference -*-*-") if drop_rows_with_missing_value_flag: inference_data_fe_selected_dropped = inference_data_fe_selected.dropna() - model_inference(inference_data_fe_selected_dropped, is_inference, run, transformer_config, transform_pipeline) - save_data(inference_data_fe_selected_dropped, "Application Data Feature-Engineering Selected Dropped-Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + inference_name_column = inference_data[NAME] + model_inference(inference_data_fe_selected_dropped, inference_name_column, is_inference, run, transformer_config, transform_pipeline) + save_data( + inference_data_fe_selected_dropped, + inference_name_column, + "Application Data Feature-Engineering Selected Dropped-Imputed", + GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, + MLFLOW_ARTIFACT_DATA_PATH, + ) else: - model_inference(inference_data_fe_selected, is_inference, run, transformer_config, transform_pipeline) + inference_name_column = inference_data[NAME] + model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline) clear_output() - # <--- Data Dumping ---> - # In this section, convert the data in the output to the summary. - GEOPI_OUTPUT_SUMMARY_PATH = os.getenv("GEOPI_OUTPUT_SUMMARY_PATH") - GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") - GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") - GEOPI_OUTPUT_PARAMETERS_PATH = os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH") - copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH, GEOPI_OUTPUT_METRICS_PATH, GEOPI_OUTPUT_PARAMETERS_PATH, GEOPI_OUTPUT_SUMMARY_PATH) + # <--- Data Dumping ---> + # In this section, convert the data in the output to the summary. 
+ GEOPI_OUTPUT_SUMMARY_PATH = os.getenv("GEOPI_OUTPUT_SUMMARY_PATH") + GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH") + GEOPI_OUTPUT_METRICS_PATH = os.getenv("GEOPI_OUTPUT_METRICS_PATH") + GEOPI_OUTPUT_PARAMETERS_PATH = os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH") + copy_files(GEOPI_OUTPUT_ARTIFACTS_PATH, GEOPI_OUTPUT_METRICS_PATH, GEOPI_OUTPUT_PARAMETERS_PATH, GEOPI_OUTPUT_SUMMARY_PATH) + mlflow.end_run() diff --git a/geochemistrypi/data_mining/data/data_readiness.py b/geochemistrypi/data_mining/data/data_readiness.py index e8fcb967..927f212f 100644 --- a/geochemistrypi/data_mining/data/data_readiness.py +++ b/geochemistrypi/data_mining/data/data_readiness.py @@ -9,7 +9,7 @@ from rich import print from sklearn.model_selection import train_test_split -from ..constants import BUILT_IN_DATASET_PATH +from ..constants import BUILT_IN_DATASET_PATH, SECTION # from utils.exceptions import InvalidFileError @@ -153,6 +153,33 @@ def select_columns(columns_range: Optional[str] = None) -> List[int]: return columns_selected +def select_column_name(data: pd.DataFrame) -> str: + """Select a single column from the dataframe and return its name. + + Parameters + ---------- + data : pd.DataFrame + The data set to be selected name. + """ + print( + "You need to choose the number of the column above as the output data identifier column.\n" + "The data identifier column helps identify uniquely each row of data point in the output data.\n" + "** For example, when using built-in dataset, you can choose the column ‘SAMPLE NAME’.**\n" + "Once finishing the whole run, in the output data file, all data point will have the value in the column ‘SAMPLE NAME’ as its unique identifier.\n" + "Enter the number of the output data identifier column." + ) + while True: + try: + column_index = int_input(column=2, prefix=SECTION[1], slogan="@Number: ") + if column_index < 1 or column_index > data.shape[1]: + print(f"The entered number is out of range! 
Please enter a number between 1 and {data.shape[1]}.") + continue + column_name = data.columns[column_index - 1] + return column_name + except ValueError: + print("Invalid input, please enter an integer.") + + def create_sub_data_set(data: pd.DataFrame, allow_empty_columns: bool = False) -> pd.DataFrame: """Create a sub data set. @@ -287,7 +314,7 @@ def create_sub_data_set(data: pd.DataFrame, allow_empty_columns: bool = False) - return sub_data_set -def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> Dict: +def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], names: pd.DataFrame, test_size: float = 0.2) -> Dict: """Split arrays or matrices into random train and test subsets. Parameters @@ -298,6 +325,9 @@ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: fl y : pd.DataFrame or pd.Series The target variable to be split. + names : pd.DataFrame + The identifier (name) column of the data, split with the same test_size and random_state as X and y. + test_size : float, default=0.2 Represents the proportion of the dataset to include in the test split. @@ -307,7 +337,8 @@ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: fl A dictionary containing the split data. """ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) - return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test} + name_train, name_test = train_test_split(names, test_size=test_size, random_state=42) + return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test, "Name Train": name_train, "Name Test": name_test} def num2option(items: List[str]) -> None: @@ -430,6 +461,39 @@ def float_input(default: float, prefix: Optional[str] = None, slogan: Optional[s return option +def int_input(column: int, prefix: Optional[str] = None, slogan: Optional[str] = "@Number: ") -> int: + """Get the number of the desired option. 
+ + Parameters + ---------- + column : int + The value returned as the option when the user does not enter anything. + + prefix : str, default=None + It indicates which section the user currently is in on the UML, which is shown on the command-line console. + + slogan : str, default="@Number: " + It acts like the first parameter of input function in Python, which outputs the hint. + + Returns + ------- + option: int + An option number. + """ + while True: + option = input(f"({prefix}) ➜ {slogan}").strip() + if option.isdigit(): + option = int(option) + break + elif len(option) == 0: + option = column + + break + else: + print("Caution: The input is not a positive integer number. Please input the right number again!") + return option + + def str_input(option_list: List[str], prefix: Optional[str] = None) -> str: """Get the string of the desired option. diff --git a/geochemistrypi/data_mining/data/feature_engineering.py b/geochemistrypi/data_mining/data/feature_engineering.py index 34ee7a23..5274fc42 100644 --- a/geochemistrypi/data_mining/data/feature_engineering.py +++ b/geochemistrypi/data_mining/data/feature_engineering.py @@ -20,7 +20,7 @@ class FeatureConstructor(object): # parenthesis = ['(', ')'] cal_words = ["pow", "sin", "cos", "tan", "pi", "mean", "std", "var", "log"] - def __init__(self, data: pd.DataFrame) -> None: + def __init__(self, data: pd.DataFrame, name_all: str) -> None: self.feature_name = None self.data = data self.alphabet = string.ascii_lowercase @@ -29,6 +29,7 @@ def __init__(self, data: pd.DataFrame) -> None: self.map_dict = {} self._result = None self.config = {} + self.name_all = name_all def index2name(self) -> None: """Show the index of columns in the data set. The display pattern is [letter : column name], e.g. 
a : 1st column name; b : 2nd column name.""" @@ -171,12 +172,12 @@ def build(self) -> None: clear_output() continue else: - save_data(self.data, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(self.data, self.name_all, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) print("Exit Feature Engineering Mode.") clear_output() break else: - save_data(self.data, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(self.data, self.name_all, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) clear_output() break return self.data diff --git a/geochemistrypi/data_mining/data/inference.py b/geochemistrypi/data_mining/data/inference.py index 29eb32a1..f2be3aed 100644 --- a/geochemistrypi/data_mining/data/inference.py +++ b/geochemistrypi/data_mining/data/inference.py @@ -109,7 +109,7 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di return transformer_config, transform_pipeline -def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None): +def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None): """Run the model inference. Parameters @@ -117,6 +117,9 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec inference_data : pd.DataFrame The inference data. + inference_name_column: str + The name of inference_data + is_inference : bool Whether to run the model inference. 
@@ -141,4 +144,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec inference_data_predicted_np = loaded_model.predict(inference_data_transformed) inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"]) GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH") - save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) + save_data(inference_data_predicted, inference_name_column, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH) diff --git a/geochemistrypi/data_mining/model/_base.py b/geochemistrypi/data_mining/model/_base.py index 38bd1611..a629e472 100644 --- a/geochemistrypi/data_mining/model/_base.py +++ b/geochemistrypi/data_mining/model/_base.py @@ -202,6 +202,9 @@ def data_upload( X_test: Optional[pd.DataFrame] = None, y_train: Optional[pd.DataFrame] = None, y_test: Optional[pd.DataFrame] = None, + name_train: Optional[pd.Series] = None, + name_test: Optional[pd.Series] = None, + name_all: Optional[pd.Series] = None, y_train_predict: Optional[pd.DataFrame] = None, y_test_predict: Optional[pd.DataFrame] = None, ) -> None: @@ -218,13 +221,19 @@ def data_upload( WorkflowBase.y_train = y_train if y_test is not None: WorkflowBase.y_test = y_test + if name_train is not None: + WorkflowBase.name_train = name_train + if name_test is not None: + WorkflowBase.name_test = name_test + if name_all is not None: + WorkflowBase.name_all = name_all if y_test_predict is not None: WorkflowBase.y_test_predict = y_test_predict if y_train_predict is not None: WorkflowBase.y_train_predict = y_train_predict @staticmethod - def data_save(df: pd.DataFrame, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None: + def data_save(df: pd.DataFrame, name: str, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None: """This method saves the data into the local path and 
the mlflow path. Parameters @@ -232,6 +241,8 @@ def data_save(df: pd.DataFrame, df_name: str, local_path: str, mlflow_path: str, df : pd.DataFrame The data to be saved. + name: str + The name. df_name : str The name of the data. @@ -246,7 +257,7 @@ def data_save(df: pd.DataFrame, df_name: str, local_path: str, mlflow_path: str, """ print(f"-----* {slogan} *-----") print(df) - save_data(df, df_name, local_path, mlflow_path) + save_data(df, name, df_name, local_path, mlflow_path) @staticmethod def save_hyper_parameters(hyper_parameters_dict: Dict, model_name: str, local_path: str) -> None: @@ -285,6 +296,7 @@ def model_save(self, is_automl: bool) -> None: def _plot_permutation_importance( X_test: pd.DataFrame, y_test: pd.DataFrame, + name_column: str, trained_model: object, image_config: dict, algorithm_name: str, @@ -295,8 +307,8 @@ def _plot_permutation_importance( print("-----* Permutation Importance Diagram *-----") importances_mean, importances_std, importances = plot_permutation_importance(X_test, y_test, trained_model, image_config) save_fig(f"Permutation Importance - {algorithm_name}", local_path, mlflow_path) - save_data(X_test, "Permutation Importance - X Test", local_path, mlflow_path) - save_data(y_test, "Permutation Importance - Y Test", local_path, mlflow_path) + save_data(X_test, name_column, "Permutation Importance - X Test", local_path, mlflow_path) + save_data(y_test, name_column, "Permutation Importance - Y Test", local_path, mlflow_path) data_dict = {"importances_mean": importances_mean.tolist(), "importances_std": importances_std.tolist(), "importances": importances.tolist()} data_str = json.dumps(data_dict, indent=4) save_text(data_str, f"Permutation Importance - {algorithm_name}", local_path, mlflow_path) @@ -306,14 +318,14 @@ class TreeWorkflowMixin: """Mixin class for tree models.""" @staticmethod - def _plot_feature_importance(X_train: pd.DataFrame, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: 
str) -> None: + def _plot_feature_importance(X_train: pd.DataFrame, name_column: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Draw the feature importance bar diagram.""" print("-----* Feature Importance Diagram *-----") columns_name = X_train.columns feature_importances = trained_model.feature_importances_ data = plot_feature_importance(columns_name, feature_importances, image_config) save_fig(f"Feature Importance - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"Feature Importance - {algorithm_name}", local_path, mlflow_path, True) + save_data(data, name_column, f"Feature Importance - {algorithm_name}", local_path, mlflow_path, True) @staticmethod def _plot_tree(trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: @@ -337,40 +349,40 @@ def _show_formula( save_text(formula_str, f"{algorithm_name} Formula", local_path, mlflow_path) @staticmethod - def _plot_2d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_2d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the 2D graph of the linear regression model.""" print("-----* 2D Scatter Diagram *-----") plot_2d_scatter_diagram(feature_data, target_data) save_fig(f"2D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([feature_data, target_data], axis=1) - save_data(data, f"2D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, data_name, f"2D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_2d_line_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def 
_plot_2d_line_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the 2D graph of the linear regression model.""" print("-----* 2D Line Diagram *-----") plot_2d_line_diagram(feature_data, target_data, y_test_predict) save_fig(f"2D Line Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([feature_data, target_data, y_test_predict], axis=1) - save_data(data, f"2D Line Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, data_name, f"2D Line Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_3d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_3d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the 3D graph of the linear regression model.""" print("-----* 3D Scatter Diagram *-----") plot_3d_scatter_diagram(feature_data, target_data) save_fig(f"3D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([feature_data, target_data], axis=1) - save_data(data, f"3D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, data_name, f"3D Scatter Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_3d_surface_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_3d_surface_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the 3D graph of the linear regression model.""" print("-----* 3D Surface Diagram *-----") plot_3d_surface_diagram(feature_data, target_data, 
y_test_predict) save_fig(f"3D Surface Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([feature_data, target_data, y_test_predict], axis=1) - save_data(data, f"3D Surface Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, data_name, f"3D Surface Diagram - {algorithm_name}", local_path, mlflow_path) class ClusteringMetricsMixin: diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index b81edd66..8af681f6 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -153,7 +153,7 @@ def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd. save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path) @staticmethod - def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the confusion matrix of the model.""" print("-----* Confusion Matrix *-----") data = plot_confusion_matrix(y_test, y_test_predict, trained_model) @@ -161,10 +161,10 @@ def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, t index = [f"true_{i}" for i in range(int(y_test.nunique().values))] columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))] data = pd.DataFrame(data, columns=columns, index=index) - save_data(data, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True) + save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True) @staticmethod - def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: 
str) -> None: + def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: print(f"-----* {graph_name} *-----") y_probs, precisions, recalls, thresholds = plot_precision_recall(X_test, y_test, trained_model, graph_name, algorithm_name) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) @@ -172,11 +172,13 @@ def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_m precisions = pd.DataFrame(precisions, columns=["Precisions"]) recalls = pd.DataFrame(recalls, columns=["Recalls"]) thresholds = pd.DataFrame(thresholds, columns=["Thresholds"]) - save_data(precisions, f"{graph_name} - Precisions", local_path, mlflow_path) - save_data(recalls, f"{graph_name} - Recalls", local_path, mlflow_path) + save_data(precisions, name_column, f"{graph_name} - Precisions", local_path, mlflow_path) + save_data(recalls, name_column, f"{graph_name} - Recalls", local_path, mlflow_path) @staticmethod - def _plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_precision_recall_threshold( + X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str + ) -> None: print(f"-----* {graph_name} *-----") y_probs, precisions, recalls, thresholds = plot_precision_recall_threshold(X_test, y_test, trained_model, graph_name, algorithm_name) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) @@ -184,13 +186,13 @@ def _plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame, precisions = pd.DataFrame(precisions, columns=["Precisions"]) recalls = pd.DataFrame(recalls, columns=["Recalls"]) thresholds = pd.DataFrame(thresholds, columns=["Thresholds"]) - 
save_data(y_probs, f"{graph_name} - Probabilities", local_path, mlflow_path) - save_data(precisions, f"{graph_name} - Precisions", local_path, mlflow_path) - save_data(recalls, f"{graph_name} - Recalls", local_path, mlflow_path) - save_data(thresholds, f"{graph_name} - Thresholds", local_path, mlflow_path) + save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path) + save_data(precisions, name_column, f"{graph_name} - Precisions", local_path, mlflow_path) + save_data(recalls, name_column, f"{graph_name} - Recalls", local_path, mlflow_path) + save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path) @staticmethod - def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: print("-----* ROC Curve *-----") y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name) save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path) @@ -198,22 +200,24 @@ def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, fpr = pd.DataFrame(fpr, columns=["False Positive Rate"]) tpr = pd.DataFrame(tpr, columns=["True Positive Rate"]) thresholds = pd.DataFrame(thresholds, columns=["Thresholds"]) - save_data(y_probs, "ROC Curve - Probabilities", local_path, mlflow_path) - save_data(fpr, "ROC Curve - False Positive Rate", local_path, mlflow_path) - save_data(tpr, "ROC Curve - True Positive Rate", local_path, mlflow_path) - save_data(thresholds, "ROC Curve - Thresholds", local_path, mlflow_path) + save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path) + save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path) + save_data(tpr, name_column, "ROC Curve - True 
Positive Rate", local_path, mlflow_path) + save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path) @staticmethod - def _plot_2d_decision_boundary(X: pd.DataFrame, X_test: pd.DataFrame, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_2d_decision_boundary( + X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str + ) -> None: """Plot the decision boundary of the trained model with the testing data set below.""" print("-----* Two-dimensional Decision Boundary Diagram *-----") plot_2d_decision_boundary(X, X_test, trained_model, image_config) save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path) - save_data(X, "Decision Boundary - X", local_path, mlflow_path) - save_data(X_test, "Decision Boundary - X Test", local_path, mlflow_path) + save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path) + save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path) @staticmethod - def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, local_path: str, mlflow_path: str) -> tuple: + def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple: """Use this method when the sample size is unbalanced.""" print("-*-*- Sample Balance on Train Set -*-*-") num2option(OPTION) @@ -228,15 +232,17 @@ def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, local_path: str print(train_set_resampled) print("Basic Statistical Information: ") basic_statistic(train_set_resampled) - save_data(X_train, "X Train After Sample Balance", local_path, mlflow_path) - save_data(y_train, "Y Train After Sample Balance", local_path, mlflow_path) + save_data(X_train, name_column, "X Train After Sample Balance", local_path, mlflow_path) + 
save_data(y_train, name_column, "Y Train After Sample Balance", local_path, mlflow_path) else: sample_balance_config = None clear_output() return sample_balance_config, X_train, y_train @staticmethod - def customize_label(y: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, local_path: str, mlflow_path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + def customize_label( + y: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, name_column1: str, name_column2: str, name_column3: str, local_path: str, mlflow_path: str + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Using this function to customize the label to which samples of each category belong.""" print("-*-*- Customize Label on Label Set -*-*-") num2option(OPTION) @@ -253,9 +259,9 @@ def customize_label(y: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame print("------------------------------------") print("Originla label VS Customizing label:") print(y_show) - save_data(y, "Y Set After Customizing label", local_path, mlflow_path) - save_data(y_train, "Y Train After Customizing label", local_path, mlflow_path) - save_data(y_test, "Y Test After Customizing label", local_path, mlflow_path) + save_data(y, name_column1, "Y Set After Customizing label", local_path, mlflow_path) + save_data(y_train, name_column2, "Y Train After Customizing label", local_path, mlflow_path) + save_data(y_test, name_column3, "Y Test After Customizing label", local_path, mlflow_path) clear_output() return y, y_train, y_test @@ -288,6 +294,7 @@ def common_components(self) -> None: self._plot_confusion_matrix( y_test=ClassificationWorkflowBase.y_test, y_test_predict=ClassificationWorkflowBase.y_test_predict, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -297,6 +304,7 @@ def common_components(self) -> None: self._plot_precision_recall( X_test=ClassificationWorkflowBase.X_test, 
y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.model, graph_name=ClassificationCommonFunction.PRECISION_RECALL_CURVE.value, algorithm_name=self.naming, @@ -306,6 +314,7 @@ def common_components(self) -> None: self._plot_precision_recall_threshold( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.model, graph_name=ClassificationCommonFunction.PRECISION_RECALL_THRESHOLD_DIAGRAM.value, algorithm_name=self.naming, @@ -315,6 +324,7 @@ def common_components(self) -> None: self._plot_ROC( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -323,6 +333,7 @@ def common_components(self) -> None: self._plot_permutation_importance( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -333,6 +344,8 @@ def common_components(self) -> None: self._plot_2d_decision_boundary( X=ClassificationWorkflowBase.X, X_test=ClassificationWorkflowBase.X_test, + name_column1=ClassificationWorkflowBase.name_all, + name_column2=ClassificationWorkflowBase.name_test, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -369,6 +382,7 @@ def common_components(self, is_automl: bool) -> None: self._plot_confusion_matrix( y_test=ClassificationWorkflowBase.y_test, y_test_predict=ClassificationWorkflowBase.y_test_predict, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -378,6 +392,7 @@ def common_components(self, 
is_automl: bool) -> None: self._plot_precision_recall( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, graph_name=ClassificationCommonFunction.PRECISION_RECALL_CURVE.value, algorithm_name=self.naming, @@ -387,6 +402,7 @@ def common_components(self, is_automl: bool) -> None: self._plot_precision_recall_threshold( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, graph_name=ClassificationCommonFunction.PRECISION_RECALL_THRESHOLD_DIAGRAM.value, algorithm_name=self.naming, @@ -396,6 +412,7 @@ def common_components(self, is_automl: bool) -> None: self._plot_ROC( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -404,6 +421,7 @@ def common_components(self, is_automl: bool) -> None: self._plot_permutation_importance( X_test=ClassificationWorkflowBase.X_test, y_test=ClassificationWorkflowBase.y_test, + name_column=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -414,6 +432,8 @@ def common_components(self, is_automl: bool) -> None: self._plot_2d_decision_boundary( X=ClassificationWorkflowBase.X, X_test=ClassificationWorkflowBase.X_test, + name_column1=ClassificationWorkflowBase.name_all, + name_column2=ClassificationWorkflowBase.name_test, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -902,6 +922,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( 
X_train=DecisionTreeClassification.X_train, + name_column=DecisionTreeClassification.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -922,6 +943,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=DecisionTreeClassification.X_train, + name_column=DecisionTreeClassification.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -1219,6 +1241,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=RandomForestClassification.X_train, + name_column=DecisionTreeClassification.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -1239,6 +1262,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=RandomForestClassification.X_train, + name_column=DecisionTreeClassification.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -1596,6 +1620,7 @@ def special_components(self, **kwargs) -> None: # ) self._plot_feature_importance( X_train=XGBoostClassification.X_train, + name_column=DecisionTreeClassification.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -1609,6 +1634,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=XGBoostClassification.X_train, + 
name_column=DecisionTreeClassification.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -1853,12 +1879,12 @@ def manual_hyper_parameters(cls) -> Dict: return hyper_parameters @staticmethod - def _plot_feature_importance(columns_name: np.ndarray, trained_model: any, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_feature_importance(columns_name: np.ndarray, name_column: str, trained_model: any, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Print the feature coefficient value orderly.""" print("-----* Feature Importance *-----") data = plot_logistic_importance(columns_name, trained_model) save_fig(f"Feature Importance - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"Feature Importance - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Feature Importance - {algorithm_name}", local_path, mlflow_path) @dispatch() def special_components(self, **kwargs) -> None: @@ -1877,6 +1903,7 @@ def special_components(self, **kwargs) -> None: ) self._plot_feature_importance( columns_name=LogisticRegressionClassification.X.columns, + name_column=LogisticRegressionClassification.name_all, trained_model=self.model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -1900,6 +1927,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: ) self._plot_feature_importance( columns_name=LogisticRegressionClassification.X.columns, + name_column=LogisticRegressionClassification.name_all, trained_model=self.auto_model, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -2513,6 +2541,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=ExtraTreesClassification.X_train, + 
name_column=LogisticRegressionClassification.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -2533,6 +2562,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=ExtraTreesClassification.X_train, + name_column=LogisticRegressionClassification.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -2876,6 +2906,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=GradientBoostingClassification.X_train, + name_column=LogisticRegressionClassification.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -2896,6 +2927,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=GradientBoostingClassification.X_train, + name_column=LogisticRegressionClassification.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, diff --git a/geochemistrypi/data_mining/model/clustering.py b/geochemistrypi/data_mining/model/clustering.py index b9943ec9..109da3a1 100644 --- a/geochemistrypi/data_mining/model/clustering.py +++ b/geochemistrypi/data_mining/model/clustering.py @@ -45,7 +45,7 @@ def manual_hyper_parameters(cls) -> Dict: return dict() @staticmethod - def _get_cluster_centers(func_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> Optional[pd.DataFrame]: + def _get_cluster_centers(func_name: str, trained_model: object, name_column: str, algorithm_name: str, local_path: str, 
mlflow_path: str) -> Optional[pd.DataFrame]: """Get the cluster centers.""" print(f"-----* {func_name} *-----") cluster_centers = getattr(trained_model, "cluster_centers_", None) @@ -58,16 +58,16 @@ def _get_cluster_centers(func_name: str, trained_model: object, algorithm_name: print(cluster_centers) cluster_centers = pd.DataFrame(cluster_centers, columns=column_name) - save_data(cluster_centers, f"{func_name} - {algorithm_name}", local_path, mlflow_path) + save_data(cluster_centers, name_column, f"{func_name} - {algorithm_name}", local_path, mlflow_path) return cluster_centers @staticmethod - def _get_cluster_labels(func_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> pd.DataFrame: + def _get_cluster_labels(func_name: str, trained_model: object, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> pd.DataFrame: """Get the cluster labels.""" print(f"-----* {func_name} *-----") cluster_label = pd.DataFrame(trained_model.labels_, columns=[func_name]) print(cluster_label) - save_data(cluster_label, f"{func_name} - {algorithm_name}", local_path, mlflow_path) + save_data(cluster_label, name_column, f"{func_name} - {algorithm_name}", local_path, mlflow_path) return cluster_label @staticmethod @@ -80,43 +80,43 @@ def _score(data: pd.DataFrame, labels: pd.Series, func_name: str, algorithm_name mlflow.log_metrics(scores) @staticmethod - def _scatter2d(data: pd.DataFrame, labels: pd.Series, cluster_centers_: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _scatter2d(data: pd.DataFrame, labels: pd.Series, name_column: str, cluster_centers_: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the two-dimensional diagram of the clustering result.""" print("-----* Cluster Two-Dimensional Diagram *-----") scatter2d(data, labels, cluster_centers_, algorithm_name) save_fig(f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, 
mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data_with_labels, name_column, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _scatter3d(data: pd.DataFrame, labels: pd.Series, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _scatter3d(data: pd.DataFrame, labels: pd.Series, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the three-dimensional diagram of the clustering result.""" print("-----* Cluster Three-Dimensional Diagram *-----") scatter3d(data, labels, algorithm_name) save_fig(f"Cluster Three-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data_with_labels, name_column, f"Cluster Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.Series, model: object, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_silhouette_diagram(data: pd.DataFrame, labels: pd.Series, name_column: str, model: object, cluster_centers_: np.ndarray, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the silhouette diagram of the clustering result.""" print("-----* Silhouette Diagram *-----") plot_silhouette_diagram(data, labels, cluster_centers_, model, algorithm_name) save_fig(f"Silhouette Diagram - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, "Silhouette Diagram - Data With Labels", local_path, mlflow_path) + save_data(data_with_labels, name_column, "Silhouette Diagram - Data With Labels", 
local_path, mlflow_path) if not isinstance(cluster_centers_, str): cluster_center_data = pd.DataFrame(cluster_centers_, columns=data.columns) - save_data(cluster_center_data, "Silhouette Diagram - Cluster Centers", local_path, mlflow_path) + save_data(cluster_center_data, name_column, "Silhouette Diagram - Cluster Centers", local_path, mlflow_path) @staticmethod - def _plot_silhouette_value_diagram(data: pd.DataFrame, labels: pd.Series, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_silhouette_value_diagram(data: pd.DataFrame, labels: pd.Series, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the silhouette value diagram of the clustering result.""" print("-----* Silhouette value Diagram *-----") plot_silhouette_value_diagram(data, labels, algorithm_name) save_fig(f"Silhouette value Diagram - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, "Silhouette value Diagram - Data With Labels", local_path, mlflow_path) + save_data(data_with_labels, name_column, "Silhouette value Diagram - Data With Labels", local_path, mlflow_path) def common_components(self) -> None: """Invoke all common application functions for clustering algorithms.""" @@ -126,6 +126,7 @@ def common_components(self) -> None: self.cluster_centers = self._get_cluster_centers( func_name=ClusteringCommonFunction.CLUSTER_CENTERS.value, trained_model=self.model, + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, mlflow_path=MLFLOW_ARTIFACT_DATA_PATH, @@ -133,6 +134,7 @@ def common_components(self) -> None: self.cluster_labels = self._get_cluster_labels( func_name=ClusteringCommonFunction.CLUSTER_LABELS.value, trained_model=self.model, + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, mlflow_path=MLFLOW_ARTIFACT_DATA_PATH, @@ -150,6 +152,7 @@ def 
common_components(self) -> None: self._scatter2d( data=two_dimen_data, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, cluster_centers_=self.cluster_centers, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -161,6 +164,7 @@ def common_components(self) -> None: self._scatter3d( data=three_dimen_data, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -171,6 +175,7 @@ def common_components(self) -> None: self._scatter2d( data=two_dimen_data, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, cluster_centers_=self.cluster_centers, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -181,6 +186,7 @@ def common_components(self) -> None: self._scatter3d( data=self.X, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -189,6 +195,7 @@ def common_components(self) -> None: self._scatter2d( data=self.X, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, cluster_centers_=self.cluster_centers, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -200,6 +207,7 @@ def common_components(self) -> None: self._plot_silhouette_diagram( data=self.X, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, cluster_centers_=self.cluster_centers, model=self.model, algorithm_name=self.naming, @@ -209,6 +217,7 @@ def common_components(self) -> None: self._plot_silhouette_value_diagram( 
data=self.X, labels=self.cluster_labels[ClusteringCommonFunction.CLUSTER_LABELS.value], + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py index 20d3dc69..7cf3ba3f 100644 --- a/geochemistrypi/data_mining/model/decomposition.py +++ b/geochemistrypi/data_mining/model/decomposition.py @@ -10,7 +10,7 @@ from sklearn.manifold import MDS, TSNE from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH -from ..utils.base import clear_output, save_data, save_fig +from ..utils.base import clear_output, save_data, save_data_special, save_fig from ._base import WorkflowBase from .func.algo_decomposition._common import plot_2d_scatter_diagram, plot_contour, plot_heatmap from .func.algo_decomposition._enum import DecompositionCommonFunction, PCASpecialFunction @@ -67,28 +67,28 @@ def _reduced_data2pd(self, reduced_data: np.ndarray, components_num: int) -> Non self.X_reduced.columns = pa_name @staticmethod - def _plot_2d_scatter_diagram(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_2d_scatter_diagram(data: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the two-dimensional diagram of the decomposition result.""" print("-----* Decomposition Two-Dimensional Diagram *-----") plot_2d_scatter_diagram(data, algorithm_name) save_fig(f"Decomposition Two-Dimensional Diagram - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"Decomposition Two-Dimensional Data - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Decomposition Two-Dimensional Data - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_heatmap(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + 
def _plot_heatmap(data: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot a heatmap for the decomposition result.""" print("-----* Decomposition Heatmap *-----") plot_heatmap(data, algorithm_name) save_fig(f"Decomposition Heatmap - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"Decomposition Heatmap Data - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Decomposition Heatmap Data - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_contour(data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_contour(data: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot a contour plot for dimensionality reduction results.""" print("-----* Dimensionality Reduction Contour Plot *-----") plot_contour(data, algorithm_name) save_fig(f"Dimensionality Reduction Contour Plot - {algorithm_name}", local_path, mlflow_path) - save_data(data, f"Dimensionality Reduction Contour Plot Data - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Dimensionality Reduction Contour Plot Data - {algorithm_name}", local_path, mlflow_path) def common_components(self) -> None: """Invoke all common application functions for decomposition algorithms by Scikit-learn framework.""" @@ -96,18 +96,21 @@ def common_components(self) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_2d_scatter_diagram( data=self.X, + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, ) self._plot_heatmap( data=self.X, + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, ) 
self._plot_contour( data=self.X, + name_column=self.name_all, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -295,18 +298,18 @@ def _biplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, """Draw bi-plot.""" print(f"-----* {graph_name} *-----") biplot(reduced_data, pc_data, algorithm_name) - save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) - save_data(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path) - save_data(pc_data, f"{graph_name} - PC Data", local_path, mlflow_path) + save_fig(f"Compositional Bi-plot - {algorithm_name}", local_path, mlflow_path) + save_data_special(reduced_data, "Compositional Bi-plot - Reduced Data", local_path, mlflow_path) + save_data_special(pc_data, "Compositional Bi-plot - PC Data", local_path, mlflow_path) @staticmethod def _triplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Draw tri-plot.""" print(f"-----* {graph_name} *-----") triplot(reduced_data, pc_data, algorithm_name) - save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) - save_data(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path) - save_data(pc_data, f"{graph_name} - PC Data", local_path, mlflow_path) + save_fig(f"Compositional Tri-plot - {algorithm_name}", local_path, mlflow_path) + save_data_special(reduced_data, "Compositional Tri-plot - Reduced Data", local_path, mlflow_path) + save_data_special(pc_data, "Compositional Tri-plot - PC Data", local_path, mlflow_path) def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" diff --git a/geochemistrypi/data_mining/model/detection.py b/geochemistrypi/data_mining/model/detection.py index 8848bea0..64b049e5 100644 --- 
a/geochemistrypi/data_mining/model/detection.py +++ b/geochemistrypi/data_mining/model/detection.py @@ -43,7 +43,7 @@ def manual_hyper_parameters(cls) -> Dict: return dict() @staticmethod - def _detect_data(X: pd.DataFrame, detect_label: np.ndarray) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + def _detect_data(X: pd.DataFrame, name_column: str, detect_label: np.ndarray) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Merge the detection results into the source data. Parameters @@ -51,6 +51,9 @@ def _detect_data(X: pd.DataFrame, detect_label: np.ndarray) -> tuple[pd.DataFram X : pd.DataFrame The original data. + name_column: str + Name of data. + detect_label : np.ndarray The detection labels for each data point. @@ -64,41 +67,50 @@ def _detect_data(X: pd.DataFrame, detect_label: np.ndarray) -> tuple[pd.DataFram X_anomaly : pd.DataFrame DataFrame containing the anomaly data points. + + name_normal : str + Name of normal data. + + name_abnormal + Name of anomaly data. + """ X_anomaly_detection = X.copy() # Merge detection results into the source data - X_anomaly_detection["is_anomaly"] = detect_label - X_normal = X_anomaly_detection[X_anomaly_detection["is_anomaly"] == 1] - X_anomaly = X_anomaly_detection[X_anomaly_detection["is_anomaly"] == -1] + X_anomaly_detection["is_abnormal"] = detect_label + X_normal = X_anomaly_detection[X_anomaly_detection["is_abnormal"] == 1] + name_normal = name_column[X_anomaly_detection["is_abnormal"] == 1] + X_abnormal = X_anomaly_detection[X_anomaly_detection["is_abnormal"] == -1] + name_abnormal = name_column[X_anomaly_detection["is_abnormal"] == -1] - return X_anomaly_detection, X_normal, X_anomaly + return X_anomaly_detection, X_normal, X_abnormal, name_normal, name_abnormal @staticmethod - def _density_estimation(data: pd.DataFrame, labels: pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _density_estimation(data: pd.DataFrame, name_column: str, labels: 
pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the density estimation diagram of the anomaly detection result.""" print(f"-----* {graph_name} *-----") density_estimation(data, labels, algorithm_name=algorithm_name) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + save_data(data_with_labels, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _scatter2d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, graph_name: str, local_path: str, mlflow_path: str) -> None: + def _scatter2d(data: pd.DataFrame, name_column: str, labels: pd.DataFrame, algorithm_name: str, graph_name: str, local_path: str, mlflow_path: str) -> None: """Plot the two-dimensional diagram of the anomaly detection result.""" print(f"-----* {graph_name} *-----") scatter2d(data, labels, algorithm_name=algorithm_name) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - save_data(data_with_labels, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + save_data(data_with_labels, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _scatter3d(data: pd.DataFrame, labels: pd.DataFrame, algorithm_name: str, graph_name: str, local_path: str, mlflow_path: str) -> None: + def _scatter3d(data: pd.DataFrame, name_column: str, labels: pd.DataFrame, algorithm_name: str, graph_name: str, local_path: str, mlflow_path: str) -> None: """Plot the three-dimensional diagram of the anomaly detection result.""" print(f"-----* {graph_name} *-----") scatter3d(data, labels, algorithm_name=algorithm_name) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) data_with_labels = pd.concat([data, labels], axis=1) - 
save_data(data_with_labels, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + save_data(data_with_labels, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path) def common_components(self) -> None: """Invoke all common application functions for anomaly detection algorithms by Scikit-learn framework.""" @@ -107,6 +119,7 @@ def common_components(self) -> None: two_dimen_axis_index, two_dimen_data = self.choose_dimension_data(self.X, 2) self._scatter2d( data=two_dimen_data, + name_column=self.name_all, labels=self.anomaly_detection_result, algorithm_name=self.naming, graph_name=AnormalyDetectionCommonFunction.PLOT_SCATTER_2D.value, @@ -117,6 +130,7 @@ def common_components(self) -> None: three_dimen_axis_index, three_dimen_data = self.choose_dimension_data(self.X, 3) self._scatter3d( data=three_dimen_data, + name_column=self.name_all, labels=self.anomaly_detection_result, algorithm_name=self.naming, graph_name=AnormalyDetectionCommonFunction.PLOT_SCATTER_3D.value, @@ -126,6 +140,7 @@ def common_components(self) -> None: self._density_estimation( data=self.X, + name_column=self.name_all, labels=self.anomaly_detection_result, algorithm_name=self.naming, graph_name=AnormalyDetectionCommonFunction.DENSITY_ESTIMATION.value, @@ -442,13 +457,13 @@ def manual_hyper_parameters(cls) -> Dict: return hyper_parameters @staticmethod - def _plot_lof_scores(X_train: pd.DataFrame, lof_scores: np.ndarray, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_lof_scores(X_train: pd.DataFrame, name_column_train: str, lof_scores: np.ndarray, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Draw the LOF scores bar diagram.""" print(f"-----* {graph_name} *-----") columns_name = X_train.index data = plot_lof_scores(columns_name, lof_scores, image_config) save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) - save_data(data, 
f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True) + save_data(data, name_column_train, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True) def special_components(self, **kwargs) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" @@ -456,6 +471,7 @@ def special_components(self, **kwargs) -> None: lof_scores = self.model.negative_outlier_factor_ self._plot_lof_scores( X_train=self.X_train, + name_column_train=self.name_all, lof_scores=lof_scores, image_config=self.image_config, algorithm_name=self.naming, diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py index 06a33c5b..8b38c4db 100644 --- a/geochemistrypi/data_mining/model/regression.py +++ b/geochemistrypi/data_mining/model/regression.py @@ -121,22 +121,22 @@ def manual_hyper_parameters(cls) -> Dict: return dict() @staticmethod - def _plot_predicted_vs_actual(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_predicted_vs_actual(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the predicted vs. actual diagram.""" print("-----* Predicted vs. Actual Diagram *-----") plot_predicted_vs_actual(y_test_predict, y_test, algorithm_name) save_fig(f"Predicted vs. Actual Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([y_test, y_test_predict], axis=1) - save_data(data, f"Predicted vs. Actual Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Predicted vs. 
Actual Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod - def _plot_residuals(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None: + def _plot_residuals(y_test_predict: pd.DataFrame, y_test: pd.DataFrame, name_column: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Plot the residuals diagram.""" print("-----* Residuals Diagram *-----") residuals = plot_residuals(y_test_predict, y_test, algorithm_name) save_fig(f"Residuals Diagram - {algorithm_name}", local_path, mlflow_path) data = pd.concat([y_test, residuals], axis=1) - save_data(data, f"Residuals Diagram - {algorithm_name}", local_path, mlflow_path) + save_data(data, name_column, f"Residuals Diagram - {algorithm_name}", local_path, mlflow_path) @staticmethod def _score(y_true: pd.DataFrame, y_predict: pd.DataFrame, algorithm_name: str, store_path: str) -> None: @@ -178,6 +178,7 @@ def common_components(self) -> None: self._plot_predicted_vs_actual( y_test_predict=RegressionWorkflowBase.y_test_predict, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -185,6 +186,7 @@ def common_components(self) -> None: self._plot_residuals( y_test_predict=RegressionWorkflowBase.y_test_predict, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -192,6 +194,7 @@ def common_components(self) -> None: self._plot_permutation_importance( X_test=RegressionWorkflowBase.X_test, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -221,6 +224,7 @@ def 
common_components(self, is_automl: bool = False) -> None: self._plot_predicted_vs_actual( y_test_predict=RegressionWorkflowBase.y_test_predict, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -228,6 +232,7 @@ def common_components(self, is_automl: bool = False) -> None: self._plot_residuals( y_test_predict=RegressionWorkflowBase.y_test_predict, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -235,6 +240,7 @@ def common_components(self, is_automl: bool = False) -> None: self._plot_permutation_importance( X_test=RegressionWorkflowBase.X_test, y_test=RegressionWorkflowBase.y_test, + name_column=RegressionWorkflowBase.name_test, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -634,6 +640,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=XGBoostRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -655,6 +662,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=XGBoostRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -909,6 +917,7 @@ def special_components(self, **kwargs): GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = 
os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=DecisionTreeRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -929,6 +938,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=DecisionTreeRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -1205,6 +1215,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=ExtraTreesRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -1225,6 +1236,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=ExtraTreesRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -1503,6 +1515,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=RandomForestRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -1523,6 +1536,7 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: 
GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=RandomForestRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -2123,6 +2137,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2132,6 +2147,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2142,6 +2158,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2150,6 +2167,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=ClassicalLinearRegression.X_test, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2157,6 +2175,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=ClassicalLinearRegression.X_test, target_data=ClassicalLinearRegression.y_test, + 
data_name=ClassicalLinearRegression.name_test, y_test_predict=ClassicalLinearRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -2167,6 +2186,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=ClassicalLinearRegression.X_test, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2174,6 +2194,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=ClassicalLinearRegression.X_test, target_data=ClassicalLinearRegression.y_test, + data_name=ClassicalLinearRegression.name_test, y_test_predict=ClassicalLinearRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -2667,6 +2688,7 @@ def special_components(self, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=GradientBoostingRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.model, image_config=self.image_config, algorithm_name=self.naming, @@ -2687,6 +2709,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH") self._plot_feature_importance( X_train=GradientBoostingRegression.X_train, + name_column=RegressionWorkflowBase.name_train, trained_model=self.auto_model, image_config=self.image_config, algorithm_name=self.naming, @@ -2884,6 +2907,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, 
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2893,6 +2917,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2903,6 +2928,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2911,6 +2937,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2918,6 +2945,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, y_test_predict=LassoRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -2928,6 +2956,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2935,6 +2964,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=LassoRegression.X_test, 
target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, y_test_predict=LassoRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -2965,6 +2995,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2974,6 +3005,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2984,6 +3016,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2992,6 +3025,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -2999,6 +3033,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, y_test_predict=LassoRegression.y_test_predict, algorithm_name=self.naming, 
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3009,6 +3044,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3016,6 +3052,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=LassoRegression.X_test, target_data=LassoRegression.y_test, + data_name=LassoRegression.name_test, y_test_predict=LassoRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3214,6 +3251,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3223,6 +3261,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3233,6 +3272,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3241,6 +3281,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=ElasticNetRegression.X_test, 
target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3248,6 +3289,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, y_test_predict=ElasticNetRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3258,6 +3300,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3265,6 +3308,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, y_test_predict=ElasticNetRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3295,6 +3339,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3304,6 +3349,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, 
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3314,6 +3360,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3322,6 +3369,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3329,6 +3377,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, y_test_predict=ElasticNetRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3339,6 +3388,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3346,6 +3396,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=ElasticNetRegression.X_test, target_data=ElasticNetRegression.y_test, + data_name=ElasticNetRegression.name_test, y_test_predict=ElasticNetRegression.y_test_predict, algorithm_name=self.naming, 
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -3648,6 +3699,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3657,6 +3709,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3667,6 +3720,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3675,6 +3729,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3683,6 +3738,7 @@ def special_components(self, **kwargs) -> None: feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, y_test_predict=SGDRegression.y_test_predict, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3692,6 +3748,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, 
algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3700,6 +3757,7 @@ def special_components(self, **kwargs) -> None: feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, y_test_predict=SGDRegression.y_test_predict, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3729,6 +3787,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3738,6 +3797,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3748,6 +3808,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3756,6 +3817,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3764,6 +3826,7 @@ def special_components(self, 
is_automl: bool, **kwargs) -> None: feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, y_test_predict=SGDRegression.y_test_predict, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3773,6 +3836,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -3781,6 +3845,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: feature_data=SGDRegression.X_test, target_data=SGDRegression.y_test, y_test_predict=SGDRegression.y_test_predict, + data_name=SGDRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4184,6 +4249,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4193,6 +4259,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=three_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4203,6 +4270,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, 
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4211,6 +4279,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4218,6 +4287,7 @@ def special_components(self, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, y_test_predict=RidgeRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -4228,6 +4298,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4235,6 +4306,7 @@ def special_components(self, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, y_test_predict=RidgeRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -4265,6 +4337,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4274,6 +4347,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( 
feature_data=three_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4284,6 +4358,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=two_dimen_data, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4292,6 +4367,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_scatter_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4299,6 +4375,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_3d_surface_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, y_test_predict=RidgeRegression.y_test_predict, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, @@ -4309,6 +4386,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_scatter_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH, @@ -4316,6 +4394,7 @@ def special_components(self, is_automl: bool, **kwargs) -> None: self._plot_2d_line_diagram( feature_data=RidgeRegression.X_test, target_data=RidgeRegression.y_test, + data_name=RidgeRegression.name_test, y_test_predict=RidgeRegression.y_test_predict, 
algorithm_name=self.naming, local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH, diff --git a/geochemistrypi/data_mining/plot/map_plot.py b/geochemistrypi/data_mining/plot/map_plot.py index a7275c98..fef1c4bc 100644 --- a/geochemistrypi/data_mining/plot/map_plot.py +++ b/geochemistrypi/data_mining/plot/map_plot.py @@ -14,7 +14,7 @@ logging.captureWarnings(True) -def map_projected_by_cartopy(col: pd.Series, longitude: pd.DataFrame, latitude: pd.DataFrame) -> None: +def map_projected_by_cartopy(col: pd.Series, name_column: str, longitude: pd.DataFrame, latitude: pd.DataFrame) -> None: """Project an element data into world map using cartopy. Parameters @@ -78,10 +78,10 @@ def map_projected_by_cartopy(col: pd.Series, longitude: pd.DataFrame, latitude: # save figure and data data = pd.concat([col, longitude, latitude], axis=1) save_fig(f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) - save_data(data, f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) + save_data(data, name_column, f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) -def map_projected_by_basemap(col: pd.Series, longitude: pd.DataFrame, latitude: pd.DataFrame) -> None: +def map_projected_by_basemap(col: pd.Series, name_column: str, longitude: pd.DataFrame, latitude: pd.DataFrame) -> None: """Project an element data into world map using basemap. 
Parameters @@ -125,10 +125,10 @@ def map_projected_by_basemap(col: pd.Series, longitude: pd.DataFrame, latitude: data = pd.concat([col, longitude, latitude], axis=1) save_fig(f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) - save_data(data, f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) + save_data(data, name_column, f"Map Projection - {col.name}", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MAP_PATH"), MLFLOW_ARTIFACT_IMAGE_MAP_PATH) -def process_world_map(data: pd.DataFrame) -> None: +def process_world_map(data: pd.DataFrame, name_column: str) -> None: """The process of projecting the data on the world map.""" map_flag = 0 is_map_projection = 0 @@ -172,9 +172,9 @@ def process_world_map(data: pd.DataFrame) -> None: # If OS is macOS, then use cartopy to project the data on the world map. my_os = get_os() if my_os == "Windows" or my_os == "Linux": - map_projected_by_basemap(data.iloc[:, elm_num - 1], longitude, latitude) + map_projected_by_basemap(data.iloc[:, elm_num - 1], name_column, longitude, latitude) elif my_os == "macOS": - map_projected_by_cartopy(data.iloc[:, elm_num - 1], longitude, latitude) + map_projected_by_cartopy(data.iloc[:, elm_num - 1], name_column, longitude, latitude) clear_output() print("Do you want to continue to project a new element in the World Map?") num2option(OPTION) diff --git a/geochemistrypi/data_mining/plot/statistic_plot.py b/geochemistrypi/data_mining/plot/statistic_plot.py index ba60625b..79d3f6e3 100644 --- a/geochemistrypi/data_mining/plot/statistic_plot.py +++ b/geochemistrypi/data_mining/plot/statistic_plot.py @@ -72,7 +72,7 @@ def ratio_null_vs_filled(data: pd.DataFrame) -> None: print("--" * 10) -def correlation_plot(col: pd.Index, df: pd.DataFrame) -> None: +def correlation_plot(col: pd.Index, df: pd.DataFrame, name_column: str) -> None: """A heatmap describing the correlation between the 
required columns. Parameters @@ -89,10 +89,10 @@ def correlation_plot(col: pd.Index, df: pd.DataFrame) -> None: sns.heatmap(plot_df_cor, cmap="coolwarm", annot=True, linewidths=0.5) print("Successfully calculate the pair-wise correlation coefficient among the selected columns.") save_fig("Correlation Plot", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) - save_data(df, "Correlation Plot", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) + save_data(df, name_column, "Correlation Plot", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) -def distribution_plot(col: pd.Index, df: pd.DataFrame) -> None: +def distribution_plot(col: pd.Index, df: pd.DataFrame, name_column: str) -> None: """The histogram containing the respective distribution subplots of the required columns. Parameters @@ -111,10 +111,10 @@ def distribution_plot(col: pd.Index, df: pd.DataFrame) -> None: plt.title(col[i]) print("Successfully draw the distribution plot of the selected columns.") save_fig("Distribution Histogram", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) - save_data(df, "Distribution Histogram", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) + save_data(df, name_column, "Distribution Histogram", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) -def log_distribution_plot(col: pd.Index, df: pd.DataFrame) -> None: +def log_distribution_plot(col: pd.Index, df: pd.DataFrame, name_column: str) -> None: """The histogram containing the respective distribution subplots after log transformation of the required columns. 
Parameters @@ -140,7 +140,7 @@ def log_distribution_plot(col: pd.Index, df: pd.DataFrame) -> None: print("Successfully draw the distribution plot after log transformation of the selected columns.") save_fig("Distribution Histogram After Log Transformation", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) - save_data(df_log_transformed, "Distribution Histogram After Log Transformation", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) + save_data(df_log_transformed, name_column, "Distribution Histogram After Log Transformation", os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_STATISTIC_PATH"), MLFLOW_ARTIFACT_IMAGE_STATISTIC_PATH) def probability_plot(col: pd.Index, df_origin: pd.DataFrame, df_impute: pd.DataFrame) -> None: diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py index 324cb591..07ab6b98 100644 --- a/geochemistrypi/data_mining/process/classify.py +++ b/geochemistrypi/data_mining/process/classify.py @@ -29,7 +29,7 @@ def __init__(self, model_name: str) -> None: self.clf_workflow = ClassificationWorkflowBase() self.transformer_config = {} - @dispatch(object, object, object, object, object, object) + @dispatch(object, object, object, object, object, object, object, object, object) def activate( self, X: pd.DataFrame, @@ -38,17 +38,20 @@ def activate( X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, + name_train: pd.Series, + name_test: pd.Series, + name_all: pd.Series, ) -> None: """Train by Scikit-learn framework.""" # Load the required data into the base class's attributes - self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test, name_all=name_all) # Customize label - y, y_train, y_test = 
self.clf_workflow.customize_label(y, y_train, y_test, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) + y, y_train, y_test = self.clf_workflow.customize_label(y, y_train, y_test, name_all, name_train, name_test, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) # Sample balance - sample_balance_config, X_train, y_train = self.clf_workflow.sample_balance(X_train, y_train, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) + sample_balance_config, X_train, y_train = self.clf_workflow.sample_balance(X_train, y_train, name_train, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) # Model option if self.model_name == "Support Vector Machine": @@ -171,9 +174,13 @@ def activate( self.clf_workflow.fit(X_train, y_train) y_train_predict = self.clf_workflow.predict(X_train) y_train_predict = self.clf_workflow.np2pd(y_train_predict, y_train.columns) + y_train_predict = y_train_predict.dropna() + y_train_predict = y_train_predict.reset_index(drop=True) self.clf_workflow.data_upload(y_train_predict=y_train_predict) y_test_predict = self.clf_workflow.predict(X_test) y_test_predict = self.clf_workflow.np2pd(y_test_predict, y_test.columns) + y_test_predict = y_test_predict.dropna() + y_test_predict = y_test_predict.reset_index(drop=True) self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, y_test_predict=y_test_predict) # Save the model hyper-parameters @@ -186,13 +193,13 @@ def activate( self.clf_workflow.special_components() # Save the prediction result - self.clf_workflow.data_save(y_train_predict, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") - self.clf_workflow.data_save(y_test_predict, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") + self.clf_workflow.data_save(y_train_predict, name_train, 
"Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") + self.clf_workflow.data_save(y_test_predict, name_test, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") # Save the trained model self.clf_workflow.model_save() - @dispatch(object, object, object, object, object, object, bool) + @dispatch(object, object, object, object, object, object, object, object, object, bool) def activate( self, X: pd.DataFrame, @@ -201,17 +208,20 @@ def activate( X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, + name_train: pd.Series, + name_test: pd.Series, + name_all: pd.Series, is_automl: bool, ) -> None: """Train by FLAML framework + RAY framework.""" - self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test, name_all=name_all) # Customize label - y, y_train, y_test = self.clf_workflow.customize_label(y, y_train, y_test, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) + y, y_train, y_test = self.clf_workflow.customize_label(y, y_train, y_test, name_all, name_train, name_test, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) # Sample balance - sample_balance_config, X_train, y_train = self.clf_workflow.sample_balance(X_train, y_train, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) + sample_balance_config, X_train, y_train = self.clf_workflow.sample_balance(X_train, y_train, name_train, os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH) # Model option if self.model_name == "Support Vector Machine": @@ -241,9 +251,13 @@ def activate( self.clf_workflow.fit(X_train, y_train, is_automl) y_train_predict = self.clf_workflow.predict(X_train, 
is_automl) y_train_predict = self.clf_workflow.np2pd(y_train_predict, y_train.columns) + y_train_predict = y_train_predict.dropna() + y_train_predict = y_train_predict.reset_index(drop=True) self.clf_workflow.data_upload(y_train_predict=y_train_predict) y_test_predict = self.clf_workflow.predict(X_test, is_automl) y_test_predict = self.clf_workflow.np2pd(y_test_predict, y_test.columns) + y_test_predict = y_test_predict.dropna() + y_test_predict = y_test_predict.reset_index(drop=True) self.clf_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, y_test_predict=y_test_predict) # Save the model hyper-parameters @@ -259,8 +273,8 @@ def activate( self.clf_workflow.special_components(is_automl) # Save the prediction result - self.clf_workflow.data_save(y_train_predict, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") - self.clf_workflow.data_save(y_test_predict, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") + self.clf_workflow.data_save(y_train_predict, name_train, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") + self.clf_workflow.data_save(y_test_predict, name_test, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") # Save the trained model self.clf_workflow.model_save(is_automl) diff --git a/geochemistrypi/data_mining/process/cluster.py b/geochemistrypi/data_mining/process/cluster.py index 1f1663a8..5c5bdb39 100644 --- a/geochemistrypi/data_mining/process/cluster.py +++ b/geochemistrypi/data_mining/process/cluster.py @@ -24,10 +24,13 @@ def activate( X_test: Optional[pd.DataFrame] = None, y_train: Optional[pd.DataFrame] = None, y_test: Optional[pd.DataFrame] = None, + name_train: Optional[pd.Series] = None, + name_test: Optional[pd.Series] = None, + 
name_all: Optional[pd.Series] = None, ) -> None: """Train by Scikit-learn framework.""" - self.clt_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.clt_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_all=name_all) if self.model_name == "KMeans": hyper_parameters = KMeansClustering.manual_hyper_parameters() diff --git a/geochemistrypi/data_mining/process/decompose.py b/geochemistrypi/data_mining/process/decompose.py index 91cb1a17..a5128190 100644 --- a/geochemistrypi/data_mining/process/decompose.py +++ b/geochemistrypi/data_mining/process/decompose.py @@ -25,10 +25,13 @@ def activate( X_test: Optional[pd.DataFrame] = None, y_train: Optional[pd.DataFrame] = None, y_test: Optional[pd.DataFrame] = None, + name_train: Optional[pd.Series] = None, + name_test: Optional[pd.Series] = None, + name_all: Optional[pd.Series] = None, ) -> None: """Train by Scikit-learn framework.""" - self.dcp_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.dcp_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_all=name_all) if self.model_name == "PCA": hyper_parameters = PCADecomposition.manual_hyper_parameters() @@ -68,7 +71,7 @@ def activate( self.dcp_workflow.special_components(components_num=hyper_parameters["n_components"], reduced_data=X_reduced) # Save decomposition result - self.dcp_workflow.data_save(X_reduced, "X Reduced", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Reduced Data") + self.dcp_workflow.data_save(X_reduced, name_all, "X Reduced", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Reduced Data") # Save the trained model self.dcp_workflow.model_save() diff --git a/geochemistrypi/data_mining/process/detect.py b/geochemistrypi/data_mining/process/detect.py index d2141c14..35e87915 100644 --- 
a/geochemistrypi/data_mining/process/detect.py +++ b/geochemistrypi/data_mining/process/detect.py @@ -24,10 +24,13 @@ def activate( X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, + name_train: pd.Series, + name_test: pd.Series, + name_all: pd.Series, ) -> None: """Train by Scikit-learn framework.""" - self.ad_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.ad_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test, name_all=name_all) # Model option if self.model_name == "Isolation Forest": @@ -55,9 +58,9 @@ def activate( # Use Scikit-learn style API to process input data self.ad_workflow.fit(X) y_predict = self.ad_workflow.predict(X) - X_anomaly_detection, X_normal, X_anomaly = self.ad_workflow._detect_data(X, y_predict) + X_anomaly_detection, X_normal, X_abnormal, name_normal, name_abnormal = self.ad_workflow._detect_data(X, name_all, y_predict) self.ad_workflow.anomaly_detection_result = X_anomaly_detection - self.ad_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + self.ad_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test, name_all=name_all) # Save the model hyper-parameters self.ad_workflow.save_hyper_parameters(hyper_parameters, self.model_name, os.getenv("GEOPI_OUTPUT_PARAMETERS_PATH")) @@ -68,10 +71,10 @@ def activate( # special components of different algorithms self.ad_workflow.special_components() - # Save anomaly detection result - self.ad_workflow.data_save(X_anomaly_detection, "X Anomaly Detection", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Anomaly Detection Data") - self.ad_workflow.data_save(X_normal, "X Normal", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Normal Data") - 
self.ad_workflow.data_save(X_anomaly, "X Anomaly", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Anomaly Data") + # Save abnormal detection result + self.ad_workflow.data_save(X_anomaly_detection, name_all, "X Abnormal Detection", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Abnormal Detection Data") + self.ad_workflow.data_save(X_normal, name_normal, "X Normal", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Normal Data") + self.ad_workflow.data_save(X_abnormal, name_abnormal, "X Abnormal", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Abnormal Data") # Save the trained model self.ad_workflow.model_save() diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py index 2ee78251..01157d75 100644 --- a/geochemistrypi/data_mining/process/regress.py +++ b/geochemistrypi/data_mining/process/regress.py @@ -36,7 +36,7 @@ def __init__(self, model_name: str) -> None: self.reg_workflow = RegressionWorkflowBase() self.transformer_config = {} - @dispatch(object, object, object, object, object, object) + @dispatch(object, object, object, object, object, object, object, object, object) def activate( self, X: pd.DataFrame, @@ -45,11 +45,13 @@ def activate( X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, + name_train: pd.Series, + name_test: pd.Series, + name_all: pd.Series, ) -> None: """Train by Scikit-learn framework.""" - self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) - + self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test) # Model option if self.model_name == "Polynomial Regression": hyper_parameters = PolynomialRegression.manual_hyper_parameters() @@ -213,9 +215,13 @@ def activate( self.reg_workflow.fit(X_train, y_train) y_train_predict = 
self.reg_workflow.predict(X_train) y_train_predict = self.reg_workflow.np2pd(y_train_predict, y_train.columns) + y_train_predict = y_train_predict.dropna() + y_train_predict = y_train_predict.reset_index(drop=True) self.reg_workflow.data_upload(y_train_predict=y_train_predict) y_test_predict = self.reg_workflow.predict(X_test) y_test_predict = self.reg_workflow.np2pd(y_test_predict, y_test.columns) + y_test_predict = y_test_predict.dropna() + y_test_predict = y_test_predict.reset_index(drop=True) self.reg_workflow.data_upload(y_test_predict=y_test_predict) # Save the model hyper-parameters @@ -228,13 +234,13 @@ def activate( self.reg_workflow.special_components() # Save the prediction result - self.reg_workflow.data_save(y_train_predict, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") - self.reg_workflow.data_save(y_test_predict, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") + self.reg_workflow.data_save(y_train_predict, name_train, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") + self.reg_workflow.data_save(y_test_predict, name_test, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") # Save the trained model self.reg_workflow.model_save() - @dispatch(object, object, object, object, object, object, bool) + @dispatch(object, object, object, object, object, object, object, object, object, bool) def activate( self, X: pd.DataFrame, @@ -243,11 +249,14 @@ def activate( X_test: pd.DataFrame, y_train: pd.DataFrame, y_test: pd.DataFrame, + name_train: pd.Series, + name_test: pd.Series, + name_all: pd.Series, is_automl: bool, ) -> None: """Train by FLAML framework + RAY framework.""" - self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test) + 
self.reg_workflow.data_upload(X=X, y=y, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, name_train=name_train, name_test=name_test) # Model option if self.model_name == "Polynomial Regression": @@ -292,9 +301,13 @@ def activate( self.reg_workflow.fit(X_train, y_train, is_automl) y_train_predict = self.reg_workflow.predict(X_train, is_automl) y_train_predict = self.reg_workflow.np2pd(y_train_predict, y_train.columns) + y_train_predict = y_train_predict.dropna() + y_train_predict = y_train_predict.reset_index(drop=True) self.reg_workflow.data_upload(y_train_predict=y_train_predict) y_test_predict = self.reg_workflow.predict(X_test, is_automl) y_test_predict = self.reg_workflow.np2pd(y_test_predict, y_test.columns) + y_test_predict = y_test_predict.dropna() + y_test_predict = y_test_predict.reset_index(drop=True) self.reg_workflow.data_upload(y_test_predict=y_test_predict) # Save the model hyper-parameters @@ -310,8 +323,8 @@ def activate( self.reg_workflow.special_components(is_automl) # Save the prediction result - self.reg_workflow.data_save(y_train_predict, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") - self.reg_workflow.data_save(y_test_predict, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") + self.reg_workflow.data_save(y_train_predict, name_train, "Y Train Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Train Prediction") + self.reg_workflow.data_save(y_test_predict, name_test, "Y Test Predict", os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"), MLFLOW_ARTIFACT_DATA_PATH, "Model Test Prediction") # Save the trained model self.reg_workflow.model_save(is_automl) diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 7f9f0f69..50dd30cd 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ 
b/geochemistrypi/data_mining/utils/base.py @@ -192,7 +192,52 @@ def save_fig(fig_name: str, local_image_path: str, mlflow_artifact_image_path: s mlflow.log_artifact(full_path) -def save_data(df: pd.DataFrame, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: +def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: + """Save the dataset in the local directory and in mlflow specialized directory. + + Parameters + ---------- + df : pd.DataFrame + The dataset to store. + + name_column: + The name of the data. + + df_name : str + The name of the data sheet. + + local_data_path : str + The path to store the data sheet + + mlflow_artifact_data_path : str, default=None + The path to store the data sheet in mlflow. + + index : bool, default=False + Whether to write the index. + """ + if name_column is not None and len(df) == len(name_column): + name_column = name_column.loc[df.index].reset_index(drop=True) + df.reset_index(drop=True, inplace=True) + name_column.reset_index(drop=True, inplace=True) + df = pd.concat([name_column, df], axis=1) + try: + # drop the index in case that the dimensions change + full_path = os.path.join(local_data_path, "{}.xlsx".format(df_name)) + df.to_excel(full_path, index=index) + if mlflow_artifact_data_path: + mlflow.log_artifact(full_path, artifact_path=mlflow_artifact_data_path) + else: + mlflow.log_artifact(full_path) + print(f"Successfully store '{df_name}' in '{df_name}.xlsx' in {local_data_path}.") + except ModuleNotFoundError: + print("** Please download openpyxl by pip3 **") + print("** The data will be stored in .csv file **") + full_path = os.path.join(local_data_path, "{}.csv".format(df_name)) + df.to_csv(full_path, index=index) + print(f"Successfully store '{df_name}' in '{df_name}.csv' in {local_data_path}.") + + +def save_data_special(df: pd.DataFrame, df_name: str, 
local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: """Save the dataset in the local directory and in mlflow specialized directory. Parameters From b040a171d8c02c3f21b48889796a0ebe6d9f1ee0 Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 15 Sep 2024 22:48:21 +0800 Subject: [PATCH 2/2] perf:Add a name to the data --- geochemistrypi/data_mining/model/decomposition.py | 14 +++++++------- geochemistrypi/data_mining/utils/base.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/geochemistrypi/data_mining/model/decomposition.py b/geochemistrypi/data_mining/model/decomposition.py index 7cf3ba3f..95e60d74 100644 --- a/geochemistrypi/data_mining/model/decomposition.py +++ b/geochemistrypi/data_mining/model/decomposition.py @@ -10,7 +10,7 @@ from sklearn.manifold import MDS, TSNE from ..constants import MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH -from ..utils.base import clear_output, save_data, save_data_special, save_fig +from ..utils.base import clear_output, save_data, save_data_without_data_identifier, save_fig from ._base import WorkflowBase from .func.algo_decomposition._common import plot_2d_scatter_diagram, plot_contour, plot_heatmap from .func.algo_decomposition._enum import DecompositionCommonFunction, PCASpecialFunction @@ -298,18 +298,18 @@ def _biplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, """Draw bi-plot.""" print(f"-----* {graph_name} *-----") biplot(reduced_data, pc_data, algorithm_name) - save_fig(f"Compositional Bi-plot - {algorithm_name}", local_path, mlflow_path) - save_data_special(reduced_data, "Compositional Bi-plot - Reduced Data", local_path, mlflow_path) - save_data_special(pc_data, "Compositional Bi-plot - PC Data", local_path, mlflow_path) + save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + save_data_without_data_identifier(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path) + save_data_without_data_identifier(pc_data, 
f"{graph_name} - PC Data", local_path, mlflow_path) @staticmethod def _triplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None: """Draw tri-plot.""" print(f"-----* {graph_name} *-----") triplot(reduced_data, pc_data, algorithm_name) - save_fig(f"Compositional Tri-plot - {algorithm_name}", local_path, mlflow_path) - save_data_special(reduced_data, "Compositional Tri-plot - Reduced Data", local_path, mlflow_path) - save_data_special(pc_data, "Compositional Tri-plot - PC Data", local_path, mlflow_path) + save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path) + save_data_without_data_identifier(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path) + save_data_without_data_identifier(pc_data, f"{graph_name} - PC Data", local_path, mlflow_path) def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None: """Invoke all special application functions for this algorithms by Scikit-learn framework.""" diff --git a/geochemistrypi/data_mining/utils/base.py b/geochemistrypi/data_mining/utils/base.py index 50dd30cd..84b3965f 100644 --- a/geochemistrypi/data_mining/utils/base.py +++ b/geochemistrypi/data_mining/utils/base.py @@ -237,7 +237,7 @@ def save_data(df: pd.DataFrame, name_column: str, df_name: str, local_data_path: print(f"Successfully store '{df_name}' in '{df_name}.csv' in {local_data_path}.") -def save_data_special(df: pd.DataFrame, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: +def save_data_without_data_identifier(df: pd.DataFrame, df_name: str, local_data_path: str, mlflow_artifact_data_path: str = None, index: bool = False) -> None: """Save the dataset in the local directory and in mlflow specialized directory. Parameters