Merge pull request #383 from ZJUEarthData/dev/Jin

feat:Added the function of switching between APP mode and standard us…
ZJUEarthData · Sep 15, 2024 · 15f18b4 · 15f18b4
2 parents 817f843 + 8d15ef4
commit 15f18b4
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 2 deletions.
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
@@ -30,9 +30,10 @@
     REGRESSION_MODELS_WITH_MISSING_VALUES,
     SECTION,
     TEST_DATA_OPTION,
+    TOGGLE_ADDRESS_STATUS,
     WORKING_PATH,
 )
-from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
+from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns, show_excel_columns
 from .data.feature_engineering import FeatureConstructor
 from .data.imputation import imputer
 from .data.inference import build_transform_pipeline, model_inference
@@ -47,6 +48,7 @@
 from .process.regress import RegressionModelSelection
 from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
 from .utils.mlflow_utils import retrieve_previous_experiment_id
+from .utils.toggle_address_status import toggle_address_status
 
 
 def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
@@ -85,6 +87,18 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
     # <-- User Training Data Loading -->
     with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"):
         sleep(0.75)
+
+    # Call toggle_address_status and pass status and training_data_path as parameters to obtain the address of the training data
+    training_data_path = toggle_address_status(status=TOGGLE_ADDRESS_STATUS, training_data_path=training_data_path)[0]
+
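+    # When several candidate files were found, let the user pick one. NOTE(review): if training_data_path is a str (or None) rather than a list, len() does not count files - confirm a type/mode check is not needed here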
+    if len(training_data_path) > 1:
+        # Display a numbered list of the candidate files held in training_data_path
+        show_excel_columns(training_data_path)
+        print("Please select only one file that you want to process:")
+        # Limit the user input to a number within the range of available files and assign the result to training_data_path
+        training_data_path = training_data_path[limit_num_input(range(1, len(training_data_path) + 1), SECTION[0], num_input) - 1]
+
     if training_data_path:
         # If the user provides file name, then load the training data from the file.
         data = read_data(file_path=training_data_path, is_own_data=1)

diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
@@ -1,5 +1,10 @@
 import os
 
+from .utils.toggle_address_status import toggle_address_status
+
+# Adjust the path of project data flow: The number 1 indicates standard mode, and the number 2 indicates APP mode.
+TOGGLE_ADDRESS_STATUS = 1
+
 # The number of uploading dataset per user is limited to 5.
 MAX_UPLOADS_PER_USER = 5
 
@@ -10,7 +15,7 @@
 BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")
 
 # current working directory in which the user activates the application
-WORKING_PATH = os.getcwd()
+WORKING_PATH = toggle_address_status(status=TOGGLE_ADDRESS_STATUS)[1]
 
 # the root directory where all the output stays
 OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")

diff --git a/geochemistrypi/data_mining/data/data_readiness.py b/geochemistrypi/data_mining/data/data_readiness.py
@@ -74,6 +74,28 @@ def basic_info(data: pd.DataFrame) -> None:
     print(data.info())
 
 
+def show_excel_columns(excel_list: Optional[List] = None) -> None:
+    """Print a numbered menu of the candidate data files.
+
+    Parameters
+    ----------
+    excel_list : Optional[List], optional
+        The file names or paths to list, one per menu row. ``None`` is treated as an
+        empty list, in which case only the header is printed. Defaults to None.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    Rows are numbered starting from 1, so the printed index is the number a user
+    would type to choose the corresponding entry.
+    """
+
+    print("-" * 20)
+
+    print("Index - Excel Name")
+
+    # Guard against the default ``None``, which cannot be iterated.
+    for index, excel_name in enumerate(excel_list or [], start=1):
+        print(index, "-", excel_name)
+
+
 def show_data_columns(columns_name: pd.Index, columns_index: Optional[List] = None) -> None:
     """Show the column names of the data set.
 

diff --git a/geochemistrypi/data_mining/utils/toggle_address_status.py b/geochemistrypi/data_mining/utils/toggle_address_status.py
@@ -0,0 +1,58 @@
+import os
+
+
+def list_excel_files(directory: str) -> list:
+    """Recursively list the spreadsheet files (.xlsx, .xls and .csv) under a directory.
+
+    Parameters
+    ----------
+    directory : str
+        The path to the directory to search, including all of its subdirectories.
+
+    Returns
+    -------
+    excel_files : list
+        The paths of every matching file, sorted so that the order is the same on every call.
+        The list is empty when nothing matches.
+
+    Notes
+    -----
+    (1) The function uses `os.walk` to traverse the directory and its subdirectories.
+    (2) Only files with extensions .xlsx, .xls, and .csv are considered; the comparison ignores
+        letter case, so e.g. ``DATA.XLSX`` is also found.
+    (3) Each returned path is `directory` joined with the relative location of the file.
+    """
+    spreadsheet_suffixes = (".xlsx", ".xls", ".csv")
+    excel_files = []
+    for root, _, files in os.walk(directory):
+        # str.endswith accepts a tuple, so one call covers every supported extension.
+        excel_files.extend(os.path.join(root, file) for file in files if file.lower().endswith(spreadsheet_suffixes))
+    # os.walk gives no ordering guarantee; sort so that a numbered menu built from this list is stable.
+    return sorted(excel_files)
+
+
+def toggle_address_status(status: str = None, training_data_path: str = None) -> list:
+    """Select the training data path and the working (output) path for the given mode.
+
+    Parameters
+    ----------
+    status : str, optional
+        The mode selector. It is converted with ``int()``, so both the numbers 1 / 2 and the
+        strings "1" / "2" are accepted.
+        - 1: Command line mode. `training_data_path` is returned unchanged.
+        - 2: Every spreadsheet file found in the "data" folder on the user's desktop is used as
+          the training data path, and the working path is the desktop itself.
+    training_data_path : str, optional
+        The path to the training data. It is only used when `status` is 1; in mode 2 it is
+        replaced by the list of files that were found.
+
+    Returns
+    -------
+    paths : list
+        A two-element list ``[training_data_path, working_path]``. In mode 1 the first element is
+        the `training_data_path` argument as given (possibly None); in mode 2 it is a list of file paths.
+
+    Raises
+    ------
+    ValueError
+        If `status` is missing, cannot be converted to an integer, or is neither 1 nor 2.
+    """
+
+    invalid_status_message = "Invalid status value. It should be '1' or '2'."
+
+    # Convert once; a missing or non-numeric status is reported the same way as an unknown mode.
+    try:
+        mode = int(status)
+    except (TypeError, ValueError):
+        raise ValueError(invalid_status_message) from None
+
+    if mode == 1:
+        # NOTE(review): this is the parent of the current working directory, not the directory itself - confirm this is intended.
+        working_path = os.path.dirname(os.getcwd())
+    elif mode == 2:
+        desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
+        training_data_path = list_excel_files(os.path.join(desktop_path, "data"))
+        working_path = desktop_path
+    else:
+        raise ValueError(invalid_status_message)
+
+    return [training_data_path, working_path]