Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added the function of switching between APP mode and standard u… #382

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
REGRESSION_MODELS_WITH_MISSING_VALUES,
SECTION,
TEST_DATA_OPTION,
TOGGLE_ADDRESS_STATUS,
WORKING_PATH,
)
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns, show_excel_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.inference import build_transform_pipeline, model_inference
Expand All @@ -47,6 +48,7 @@
from .process.regress import RegressionModelSelection
from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id
from .utils.toggle_address_status import toggle_address_status


def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
Expand Down Expand Up @@ -85,6 +87,14 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
# <-- User Training Data Loading -->
with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"):
sleep(0.75)

training_data_path = toggle_address_status(status=TOGGLE_ADDRESS_STATUS, training_data_path=training_data_path)[0]

if len(training_data_path) > 1:
show_excel_columns(training_data_path)
print("Please select only one file that you want to process:")
training_data_path = limit_num_input(range(1, len(training_data_path) + 1), SECTION[0], num_input)

if training_data_path:
# If the user provides file name, then load the training data from the file.
data = read_data(file_path=training_data_path, is_own_data=1)
Expand Down
7 changes: 6 additions & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import os

from .utils.toggle_address_status import toggle_address_status

# Adjust the path of project data flow: The number 1 indicates standard mode, and the number 2 indicates APP mode.
TOGGLE_ADDRESS_STATUS = 1

# The number of uploading dataset per user is limited to 5.
MAX_UPLOADS_PER_USER = 5

Expand All @@ -10,7 +15,7 @@
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# current working directory in which the user activates the application
WORKING_PATH = os.getcwd()
WORKING_PATH = toggle_address_status(status=TOGGLE_ADDRESS_STATUS)[1]

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
Expand Down
20 changes: 20 additions & 0 deletions geochemistrypi/data_mining/data/data_readiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,26 @@ def basic_info(data: pd.DataFrame) -> None:
print(data.info())


def show_excel_columns(excel_list: Optional[List] = None) -> None:
    """Display a numbered menu of the provided Excel file names.

    Prints a 1-based index next to each entry so the user can pick a file by
    number. A missing or empty list is handled gracefully (only the header is
    printed) instead of raising TypeError on the documented default of None.

    Args:
        excel_list (Optional[List]): A list of Excel file names (or paths).
            Defaults to None, which is treated as an empty list.

    Returns:
        None
    """
    # Print a separator line for visual clarity
    print("-" * 20)

    # Header row: "<1-based index> - <file name>"
    print("Index - Excel Name")

    # enumerate(..., start=1) yields the 1-based index directly, avoiding the
    # range(len(...)) anti-pattern; `or []` guards against the None default.
    for index, excel_name in enumerate(excel_list or [], start=1):
        print(index, "-", excel_name)


def show_data_columns(columns_name: pd.Index, columns_index: Optional[List] = None) -> None:
"""Show the column names of the data set.

Expand Down
63 changes: 24 additions & 39 deletions geochemistrypi/data_mining/model/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from ..utils.base import clear_output, save_data, save_fig
from ._base import WorkflowBase
from .func.algo_decomposition._common import plot_2d_scatter_diagram, plot_contour, plot_heatmap
from .func.algo_decomposition._enum import DecompositionCommonFunction, PCASpecialFunction
from .func.algo_decomposition._mds import mds_manual_hyper_parameters
from .func.algo_decomposition._pca import biplot, pca_manual_hyper_parameters, triplot
from .func.algo_decomposition._tsne import tsne_manual_hyper_parameters
Expand All @@ -22,7 +21,7 @@
class DecompositionWorkflowBase(WorkflowBase):
"""The base workflow class of decomposition algorithms."""

common_function = [func.value for func in DecompositionCommonFunction] # 'Decomposition Result',
common_function = ["Model Persistence"] # 'Decomposition Result',

def __init__(self) -> None:
super().__init__()
Expand Down Expand Up @@ -118,7 +117,7 @@ class PCADecomposition(DecompositionWorkflowBase):
"""The automation workflow of using PCA algorithm to make insightful products."""

name = "PCA"
special_function = [func.value for func in PCASpecialFunction]
special_function = ["Principal Components", "Explained Variance Ratio", "Compositional Bi-plot", "Compositional Tri-plot"]

def __init__(
self,
Expand Down Expand Up @@ -269,57 +268,48 @@ def manual_hyper_parameters(cls) -> Dict:
clear_output()
return hyper_parameters

@staticmethod
def _get_principal_components(graph_name: str, n_components: Optional[int], trained_model: object) -> None:
def _get_principal_components(self) -> None:
"""Get principal components."""
print(f"-----* {graph_name} *-----")
print("-----* Principal Components *-----")
print("Every column represents one principal component respectively.")
print("Every row represents how much that row feature contributes to each principal component respectively.")
print("The tabular data looks like in format: 'rows x columns = 'features x principal components'.")
pc_name = []
for i in range(n_components):
for i in range(self.n_components):
pc_name.append(f"PC{i+1}")
pc_data = pd.DataFrame(trained_model.components_.T)
pc_data.columns = pc_name
pc_data.set_index(DecompositionWorkflowBase.X.columns, inplace=True)
print(pc_data)
self.pc_data = pd.DataFrame(self.model.components_.T)
self.pc_data.columns = pc_name
self.pc_data.set_index(DecompositionWorkflowBase.X.columns, inplace=True)
print(self.pc_data)

@staticmethod
def _get_explained_variance_ratio(graph_name: str, trained_model: object) -> None:
def _get_explained_variance_ratio(self) -> None:
"""Get explained variance ratio."""
print(f"-----* {graph_name} *-----")
print(trained_model.explained_variance_ratio_)
print("-----* Explained Variance Ratio *-----")
print(self.model.explained_variance_ratio_)

@staticmethod
def _biplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _biplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Draw bi-plot."""
print(f"-----* {graph_name} *-----")
print("-----* Compositional Bi-plot *-----")
biplot(reduced_data, pc_data, algorithm_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
save_data(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path)
save_data(pc_data, f"{graph_name} - PC Data", local_path, mlflow_path)
save_fig(f"Compositional Bi-plot - {algorithm_name}", local_path, mlflow_path)
save_data(reduced_data, "Compositional Bi-plot - Reduced Data", local_path, mlflow_path)
save_data(pc_data, "Compositional Bi-plot - PC Data", local_path, mlflow_path)

@staticmethod
def _triplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _triplot(reduced_data: pd.DataFrame, pc_data: pd.DataFrame, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
"""Draw tri-plot."""
print(f"-----* {graph_name} *-----")
print("-----* Compositional Tri-plot *-----")
triplot(reduced_data, pc_data, algorithm_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
save_data(reduced_data, f"{graph_name} - Reduced Data", local_path, mlflow_path)
save_data(pc_data, f"{graph_name} - PC Data", local_path, mlflow_path)
save_fig(f"Compositional Tri-plot - {algorithm_name}", local_path, mlflow_path)
save_data(reduced_data, "Compositional Tri-plot - Reduced Data", local_path, mlflow_path)
save_data(pc_data, "Compositional Tri-plot - PC Data", local_path, mlflow_path)

def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
"""Invoke all special application functions for this algorithms by Scikit-learn framework."""
self._reduced_data2pd(kwargs["reduced_data"], kwargs["components_num"])
self._get_principal_components(
graph_name=PCASpecialFunction.PRINCIPAL_COMPONENTS.value,
trained_model=self.model,
n_components=self.n_components,
)
self._get_explained_variance_ratio(
graph_name=PCASpecialFunction.EXPLAINED_VARIANCE_RATIO.value,
trained_model=self.model,
)
self._get_principal_components()
self._get_explained_variance_ratio()

GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH")
# Draw graphs when the number of principal components > 3
Expand All @@ -330,7 +320,6 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
self._biplot(
reduced_data=two_dimen_reduced_data,
pc_data=two_dimen_pc_data,
graph_name=PCASpecialFunction.COMPOSITIONAL_BI_PLOT.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -342,7 +331,6 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
self._triplot(
reduced_data=three_dimen_reduced_data,
pc_data=three_dimen_pc_data,
graph_name=PCASpecialFunction.COMPOSITIONAL_TRI_PLOT.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -354,7 +342,6 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
self._biplot(
reduced_data=two_dimen_reduced_data,
pc_data=two_dimen_pc_data,
graph_name=PCASpecialFunction.COMPOSITIONAL_BI_PLOT.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -363,7 +350,6 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
self._triplot(
reduced_data=self.X_reduced,
pc_data=self.pc_data,
graph_name=PCASpecialFunction.COMPOSITIONAL_TRI_PLOT.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -372,7 +358,6 @@ def special_components(self, **kwargs: Union[Dict, np.ndarray, int]) -> None:
self._biplot(
reduced_data=self.X_reduced,
pc_data=self.pc_data,
graph_name=PCASpecialFunction.COMPOSITIONAL_BI_PLOT.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down
12 changes: 0 additions & 12 deletions geochemistrypi/data_mining/model/func/algo_decomposition/_enum.py

This file was deleted.

42 changes: 42 additions & 0 deletions geochemistrypi/data_mining/utils/toggle_address_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os


def list_excel_files(directory):
    """Recursively list all tabular data files (.xlsx, .xls, .csv) under a directory.

    Walks the directory tree rooted at *directory* and collects the path of
    every file whose extension marks it as a supported dataset file.

    Parameters:
        directory (str): The path to the directory to search.

    Returns:
        list: Paths (each joined with its containing folder) of all matching
        files, in os.walk traversal order. Empty if the directory does not
        exist or contains no matching files.
    """
    # str.endswith accepts a tuple, so one call covers all supported extensions.
    supported_extensions = (".xlsx", ".xls", ".csv")
    excel_files = []
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if file_name.endswith(supported_extensions):
                excel_files.append(os.path.join(root, file_name))
    return excel_files


def toggle_address_status(status: str = None, training_data_path: str = None):
    """Resolve the training data path and working (output) path for a mode.

    Args:
        status (int | str): The mode flag, 1 or 2 (int or numeric string).
            - 1 (standard mode): keep the provided `training_data_path`; the
              working path is the parent directory of the current working directory.
            - 2 (APP mode): collect all Excel/CSV files from the "data" folder
              on the user's desktop as the training data paths; the working
              path is the desktop itself.
        training_data_path (str): Path supplied by the user; only used in mode 1.

    Returns:
        list: A two-element list `[training_data_path, working_path]`.

    Raises:
        ValueError: If `status` is None or not 1/2. (Previously an invalid
            status fell through a bare `else: pass` and raised NameError on
            the unbound `working_path` at return time.)
    """
    if status is None:
        raise ValueError("status must be 1 (standard mode) or 2 (APP mode), got None")

    mode = int(status)
    if mode == 1:
        # Standard mode: keep the caller-supplied path unchanged.
        working_path = os.path.dirname(os.getcwd())
    elif mode == 2:
        # APP mode: datasets live in ~/Desktop/data; output goes to the desktop.
        desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
        training_data_path = list_excel_files(os.path.join(desktop_path, "data"))
        working_path = desktop_path
    else:
        raise ValueError(f"status must be 1 (standard mode) or 2 (APP mode), got {status!r}")

    return [training_data_path, working_path]
Loading