feat: use environment variable to formalize the output path. #188

Merged · 1 commit · Jul 24, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@

# all produced data set under the directory output
**/output
+ **/geopi_output

# all saved trained model under the directory trained model
**/trained_models
85 changes: 43 additions & 42 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -10,7 +10,6 @@
from .constants import (
CLASSIFICATION_MODELS,
CLUSTERING_MODELS,
- DATASET_OUTPUT_PATH,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
GEO_IMAGE_PATH,
@@ -19,7 +18,6 @@
MLFLOW_ARTIFACT_DATA_PATH,
MODE_OPTION,
MODEL_OUTPUT_IMAGE_PATH,
- MODEL_PATH,
NON_AUTOML_MODELS,
OPTION,
OUTPUT_PATH,
@@ -40,16 +38,14 @@
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
from .process.regress import RegressionModelSelection
- from .utils.base import clear_output, log, save_data, show_warning
+ from .utils.base import clear_output, create_geopi_output_dir, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id

# create the directories if they didn't exist yet
os.makedirs(MODEL_OUTPUT_IMAGE_PATH, exist_ok=True)
os.makedirs(STATISTIC_IMAGE_PATH, exist_ok=True)
- os.makedirs(DATASET_OUTPUT_PATH, exist_ok=True)
os.makedirs(MAP_IMAGE_PATH, exist_ok=True)
os.makedirs(GEO_IMAGE_PATH, exist_ok=True)
- os.makedirs(MODEL_PATH, exist_ok=True)


def cli_pipeline(file_name: str) -> None:
@@ -110,6 +106,7 @@ def cli_pipeline(file_name: str) -> None:
run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
+ create_geopi_output_dir(experiment.name, run_name)
clear_output()

# <--- Built-in Data Loading --->
@@ -140,44 +137,46 @@ def cli_pipeline(file_name: str) -> None:
logger.debug("Data Selection")
print("-*-*- Data Selection -*-*-")
show_data_columns(data.columns)
- data_processed = create_sub_data_set(data)
+ data_selected = create_sub_data_set(data)
clear_output()
print("The Selected Data Set:")
- print(data_processed)
+ print(data_selected)
clear_output()
print("Basic Statistical Information: ")
- basic_info(data_processed)
- basic_statistic(data_processed)
- correlation_plot(data_processed.columns, data_processed)
- distribution_plot(data_processed.columns, data_processed)
- logged_distribution_plot(data_processed.columns, data_processed)
- save_data(data_processed, "Data Selected", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ basic_info(data_selected)
+ basic_statistic(data_selected)
+ correlation_plot(data_selected.columns, data_selected)
+ distribution_plot(data_selected.columns, data_selected)
+ logged_distribution_plot(data_selected.columns, data_selected)
+ GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
+ save_data(data, "Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(data_selected, "Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# <--- Imputation --->
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
- is_null_value(data_processed)
- ratio_null_vs_filled(data_processed)
- imputed_flag = is_imputed(data_processed)
+ is_null_value(data_selected)
+ ratio_null_vs_filled(data_selected)
+ imputed_flag = is_imputed(data_selected)
clear_output()
if imputed_flag:
print("-*-*- Strategy for Missing Values -*-*-")
num2option(IMPUTING_STRATEGY)
print("Which strategy do you want to apply?")
strategy_num = limit_num_input(IMPUTING_STRATEGY, SECTION[1], num_input)
- data_processed_imputed_np = imputer(data_processed, IMPUTING_STRATEGY[strategy_num - 1])
- data_processed_imputed = np2pd(data_processed_imputed_np, data_processed.columns)
- del data_processed_imputed_np
+ data_selected_imputed_np = imputer(data_selected, IMPUTING_STRATEGY[strategy_num - 1])
+ data_selected_imputed = np2pd(data_selected_imputed_np, data_selected.columns)
+ del data_selected_imputed_np
clear_output()
print("-*-*- Hypothesis Testing on Imputation Method -*-*-")
print("Null Hypothesis: The distributions of the data set before and after imputing remain the same.")
print("Thoughts: Check which column rejects null hypothesis.")
print("Statistics Test Method: Wilcoxon Test")
monte_carlo_simulator(
- data_processed,
- data_processed_imputed,
- sample_size=data_processed_imputed.shape[0] // 2,
+ data_selected,
+ data_selected_imputed,
+ sample_size=data_selected_imputed.shape[0] // 2,
iteration=100,
test="wilcoxon",
confidence=0.05,
@@ -186,20 +185,22 @@ def cli_pipeline(file_name: str) -> None:
# print("The statistics test method: Kruskal Wallis Test")
# monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
# iteration=100, test='kruskal', confidence=0.05)
- probability_plot(data_processed.columns, data_processed, data_processed_imputed)
- basic_info(data_processed_imputed)
- basic_statistic(data_processed_imputed)
- del data_processed
+ probability_plot(data_selected.columns, data_selected, data_selected_imputed)
+ basic_info(data_selected_imputed)
+ basic_statistic(data_selected_imputed)
+ save_data(data_selected_imputed, "Data Selected Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ del data_selected
clear_output()
else:
# if the selected data set doesn't need imputation, which means there are no missing values.
- data_processed_imputed = data_processed
+ data_selected_imputed = data_selected

# <--- Feature Engineering --->
logger.debug("Feature Engineering")
- feature_built = FeatureConstructor(data_processed_imputed)
+ feature_built = FeatureConstructor(data_selected_imputed)
feature_built.process_feature_engineering()
- data_processed_imputed = feature_built.data
+ data_selected_imputed_fe = feature_built.data
+ del data_selected_imputed

# <--- Mode Selection --->
logger.debug("Mode Selection")
@@ -216,15 +217,15 @@ def cli_pipeline(file_name: str) -> None:
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
print("Selected sub data set to create X data set:")
- show_data_columns(data_processed_imputed.columns)
+ show_data_columns(data_selected_imputed_fe.columns)
print("The selected X data set:")
- X = create_sub_data_set(data_processed_imputed)
+ X = create_sub_data_set(data_selected_imputed_fe)
print("Successfully create X data set.")
print("The Selected Data Set:")
print(X)
print("Basic Statistical Information: ")
basic_statistic(X)
save_data(X, "X Without Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# <--- Feature Scaling --->
@@ -242,23 +243,23 @@ def cli_pipeline(file_name: str) -> None:
print(X)
print("Basic Statistical Information: ")
basic_statistic(X)
save_data(X, "X With Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# create Y data set
print("-*-*- Data Split - X Set and Y Set-*-*-")
print("Selected sub data set to create Y data set:")
- show_data_columns(data_processed_imputed.columns)
+ show_data_columns(data_selected_imputed_fe.columns)
print("The selected Y data set:")
print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
print("Notice: For classification model training, please choose the label column which has distinctive integers.")
- y = create_sub_data_set(data_processed_imputed)
+ y = create_sub_data_set(data_selected_imputed_fe)
print("Successfully create Y data set.")
print("The Selected Data Set:")
print(y)
print("Basic Statistical Information: ")
basic_statistic(y)
save_data(y, "y", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# create training data and testing data
@@ -272,15 +273,15 @@ def cli_pipeline(file_name: str) -> None:
print(value)
print(f"Basic Statistical Information: {key}")
basic_statistic(value)
- save_data(value, key, DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- X_train, X_test = train_test_data["X train"], train_test_data["X test"]
- y_train, y_test = train_test_data["y train"], train_test_data["y test"]
- del data_processed_imputed
+ save_data(value, key, GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
+ y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
+ del data_selected_imputed_fe
clear_output()
else:
# unsupervised learning
- X = data_processed_imputed
- X_train = data_processed_imputed
+ X = data_selected_imputed_fe
+ X_train = data_selected_imputed_fe
y, X_test, y_train, y_test = None, None, None, None

# <--- Model Selection --->
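Note: `create_geopi_output_dir`, newly imported from `.utils.base` above, is defined outside this diff. A minimal sketch of what such a helper might look like, assuming it builds a per-run directory tree under `geopi_output` and publishes the artifacts path through the same `GEOPI_OUTPUT_ARTIFACTS_DATA_PATH` environment variable that `cli_pipeline.py` later reads back with `os.getenv` (the directory layout below is an assumption, not the actual implementation):

```python
import os


def create_geopi_output_dir(experiment_name: str, run_name: str) -> None:
    """Sketch: create per-run output directories and export their paths.

    The real helper lives in geochemistrypi/data_mining/utils/base.py and is
    not shown in this diff; the layout here is illustrative only.
    """
    # Root of all output for this run: <cwd>/geopi_output/<experiment>/<run>
    output_path = os.path.join(os.getcwd(), "geopi_output", experiment_name, run_name)
    # Where save_data() drops intermediate data sets for this run
    artifacts_data_path = os.path.join(output_path, "artifacts", "data")
    os.makedirs(artifacts_data_path, exist_ok=True)
    # Publish the path through the process environment so any module can
    # fetch it with os.getenv() instead of importing a hard-coded constant.
    os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = artifacts_data_path
```

Setting the path after `mlflow.start_run()` lets it embed the experiment and run names, which the removed import-time constants (`DATASET_OUTPUT_PATH`, `MODEL_PATH`) could not.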
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
@@ -13,7 +13,7 @@
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "output")
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")

# the directory where the data set produced stays
DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")
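Together with the `.gitignore` entry above, all generated output now lands under a `geopi_output` folder in the working directory. A quick sketch of how the constant resolves, assuming `WORKING_PATH` is derived from the current working directory earlier in `constants.py`:

```python
import os

WORKING_PATH = os.getcwd()  # assumption: defined near the top of constants.py
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
# e.g. /home/user/my-project/geopi_output
```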
25 changes: 21 additions & 4 deletions geochemistrypi/data_mining/data/data_readiness.py
@@ -1,6 +1,6 @@
import os
import sys
- from typing import List, Optional, Tuple, Union
+ from typing import Dict, List, Optional, Tuple, Union

import openpyxl.utils.exceptions
import pandas as pd
@@ -154,10 +154,27 @@ def create_sub_data_set(data: pd.DataFrame) -> pd.DataFrame:
return sub_data_set


- def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> dict:
- """Split arrays or matrices into random train and test subsets."""
+ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> Dict:
+ """Split arrays or matrices into random train and test subsets.
+
+ Parameters
+ ----------
+ X : pd.DataFrame
+ The data to be split.
+
+ y : pd.DataFrame or pd.Series
+ The target variable to be split.
+
+ test_size : float, default=0.2
+ Represents the proportion of the dataset to include in the test split.
+
+ Returns
+ -------
+ dict
+ A dictionary containing the split data.
+ """
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
return {"X train": X_train, "X test": X_test, "y train": y_train, "y test": y_test}
return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}


def num2option(items: List[str]) -> None:
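Because the returned keys are now Title Case, callers must unpack with the new names, as the `cli_pipeline.py` hunk above already does. A small usage sketch with toy data (the import path assumes an installed `geochemistrypi` package):

```python
import pandas as pd

from geochemistrypi.data_mining.data.data_readiness import data_split

# Toy feature matrix and target variable.
X = pd.DataFrame({"SiO2": [50.1, 47.3, 61.0, 55.2], "MgO": [7.2, 8.9, 3.1, 5.5]})
y = pd.Series([0, 1, 0, 1], name="label")

train_test_data = data_split(X, y, test_size=0.25)
# The old lowercase keys ("X train", "y train", ...) would now raise KeyError.
X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
```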
39 changes: 19 additions & 20 deletions geochemistrypi/data_mining/data/feature_engineering.py
@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
+ import os
import string

import numpy as np
import pandas as pd
from rich import print

- from ..constants import DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
+ from ..constants import MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
from ..plot.statistic_plot import basic_statistic
from ..utils.base import clear_output, save_data
from .data_readiness import basic_info, limit_num_input, num2option, num_input, show_data_columns
@@ -19,37 +20,34 @@ class FeatureConstructor(object):
alphabet = string.ascii_letters
cal_words = ["pow", "sin", "cos", "tan", "pi", "mean", "std", "var", "log"]

- def __init__(self, data):
+ def __init__(self, data: pd.DataFrame) -> None:
self.feature_name = None
self.data = data
self._infix_expr = []
self._postfix_expr = []
self.map_dict = {}
self._result = None

- def index2name(self):
- """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name
-
- :return: index : column name, dict
- """
+ def index2name(self) -> None:
+ """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name."""
columns_name = self.data.columns
print("Selected data set:")
for i in range(len(columns_name)):
print(FeatureConstructor.alphabet[i] + " - " + columns_name[i])
self.map_dict[FeatureConstructor.alphabet[i]] = columns_name[i]

- def _get_column(self, index):
+ def _get_column(self, index: str) -> str:
return self.map_dict[index]

- def name_feature(self):
+ def name_feature(self) -> None:
while True:
self.feature_name = input("Name the constructed feature (column name), like 'NEW-COMPOUND': \n" "@input: ")
if len(self.feature_name) == 0:
print("Sorry!You haven't named it yet!")
else:
break

- def input_expression(self):
+ def input_expression(self) -> None:
expression = input(
"Build up new feature with the combination of 4 basic arithmatic operator,"
" including '+', '-', '*', '/', '()'.\n"
@@ -103,7 +101,8 @@ def input_expression(self):
else:
break

- def evaluate(self):
+ def evaluate(self) -> None:
+ """Evaluate the expression."""
self.letter_map()
np.array(["dummy"]) # dummy array to skip the flake8 warning - F401 'numpy as np' imported but unused'
self._infix_expr = self._infix_expr.replace("sin", "np.sin")
@@ -127,7 +126,8 @@ def evaluate(self):
except ZeroDivisionError:
print("The expression contains a division by zero.")

- def letter_map(self):
+ def letter_map(self) -> None:
+ """Map the letter to the column name."""
new_text = ""
test_text = "".join(ch for ch in self._infix_expr if ch not in set(" "))
for words in FeatureConstructor.cal_words:
@@ -147,12 +147,14 @@ def letter_map(self) -> None:
else:
self._infix_expr += ww

- def process_feature_engineering(self):
+ def process_feature_engineering(self) -> None:
+ """Process the feature engineering."""
print("-*-*- Feature Engineering -*-*-")
print("The Selected Data Set:")
show_data_columns(self.data.columns)
fe_flag = 0
- is_feature_engineering = 0
+ GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
while True:
if fe_flag != 1:
print("Feature Engineering Option:")
@@ -179,20 +181,17 @@ def process_feature_engineering(self):
clear_output()
continue
else:
save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
print("Exit Feature Engineering Mode.")
clear_output()
break
else:
save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
break

- def create_data_set(self):
+ def create_data_set(self) -> pd.DataFrame:
+ """Create a new data set with the new feature."""
print(f'Successfully construct a new feature "{self.feature_name}".')
print(self._result)
return pd.concat([self.data, self._result], axis=1)

- # TODO: Is the scope of input right?
- def check_data_scope(self):
- pass
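For context on what `letter_map` and `evaluate` do at runtime: letters stand in for column names, the user's infix expression is rewritten into real column accesses, math words are routed through NumPy (hence the `sin` to `np.sin` substitutions and the `ZeroDivisionError` handler above), and the rewritten string appears to be evaluated with `eval`. A self-contained sketch of that idea, not the class's exact code:

```python
import re

import numpy as np
import pandas as pd

data = pd.DataFrame({"SiO2": [50.1, 47.3], "Al2O3": [15.0, 16.2]})
map_dict = {"a": "SiO2", "b": "Al2O3"}  # a: 1st column, b: 2nd column

# The user builds a feature over letters, e.g. "log(a) + b / 2".
user_expr = "log(a) + b / 2"
# evaluate()-style rewrite: route math words through NumPy.
infix_expr = re.sub(r"\blog\b", "np.log", user_expr)
# letter_map-style rewrite: swap each letter for a column access.
for letter, column in map_dict.items():
    infix_expr = re.sub(rf"\b{letter}\b", f"data['{column}']", infix_expr)

new_feature = eval(infix_expr)  # assumption: evaluate() relies on eval()
print(new_feature)  # a pandas Series, ready to concat as the new column
```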