diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index e3c147dd..47796b30 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -30,9 +30,10 @@
     REGRESSION_MODELS_WITH_MISSING_VALUES,
     SECTION,
     TEST_DATA_OPTION,
+    TOGGLE_ADDRESS_STATUS,
     WORKING_PATH,
 )
-from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
+from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns, show_excel_columns
 from .data.feature_engineering import FeatureConstructor
 from .data.imputation import imputer
 from .data.inference import build_transform_pipeline, model_inference
@@ -47,6 +48,7 @@
 from .process.regress import RegressionModelSelection
 from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
 from .utils.mlflow_utils import retrieve_previous_experiment_id
+from .utils.toggle_address_status import toggle_address_status
 
 
 def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
@@ -85,6 +87,22 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
     # <-- User Training Data Loading -->
     with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"):
         sleep(0.75)
+
+    # Resolve the training data source: in standard mode the command line path comes back unchanged, in APP mode a list of candidate files comes back instead.
+    training_data_path = toggle_address_status(status=TOGGLE_ADDRESS_STATUS, training_data_path=training_data_path)[0]
+
+    # Only a list of candidate files needs a selection step. A plain string path must be left untouched, because len() of a string counts characters, not files.
+    if isinstance(training_data_path, list):
+        if len(training_data_path) > 1:
+            # Display the candidate files together with their 1-based index
+            show_excel_columns(training_data_path)
+            print("Please select only one file that you want to process:")
+            # The chosen number is taken from range(1, n + 1), so subtract 1 to turn it into a list index
+            training_data_path = training_data_path[limit_num_input(range(1, len(training_data_path) + 1), SECTION[0], num_input) - 1]
+        elif training_data_path:
+            # Exactly one candidate file: unwrap it so that a path, not a list, is handed to read_data
+            training_data_path = training_data_path[0]
+
     if training_data_path:
         # If the user provides file name, then load the training data from the file.
         data = read_data(file_path=training_data_path, is_own_data=1)
diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py
index 4eb42641..0dde2f32 100644
--- a/geochemistrypi/data_mining/constants.py
+++ b/geochemistrypi/data_mining/constants.py
@@ -1,5 +1,10 @@
 import os
 
+from .utils.toggle_address_status import toggle_address_status
+
+# Adjust the path of project data flow: The number 1 indicates standard mode, and the number 2 indicates APP mode.
+TOGGLE_ADDRESS_STATUS = 1
+
 # The number of uploading dataset per user is limited to 5.
 MAX_UPLOADS_PER_USER = 5
 
@@ -10,7 +15,7 @@
 BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")
 
 # current working directory in which the user activates the application
-WORKING_PATH = os.getcwd()
+WORKING_PATH = toggle_address_status(status=TOGGLE_ADDRESS_STATUS)[1]
 
 # the root directory where all the output stays
 OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
diff --git a/geochemistrypi/data_mining/data/data_readiness.py b/geochemistrypi/data_mining/data/data_readiness.py
index 6ea0e170..e8fcb967 100644
--- a/geochemistrypi/data_mining/data/data_readiness.py
+++ b/geochemistrypi/data_mining/data/data_readiness.py
@@ -74,6 +74,30 @@ def basic_info(data: pd.DataFrame) -> None:
     print(data.info())
 
 
+def show_excel_columns(excel_list: Optional[List] = None) -> None:
+    """Print a 1-based index next to every entry of the given list of data files.
+
+    Parameters
+    ----------
+    excel_list : Optional[List], optional
+        The entries to display, e.g. the paths of the candidate data files.
+        None (the default) is treated like an empty list.
+
+    Returns
+    -------
+    None
+
+    """
+
+    print("-" * 20)
+
+    print("Index - Excel Name")
+
+    # `excel_list or []` keeps the default None from raising a TypeError
+    for index, excel_name in enumerate(excel_list or [], start=1):
+        print(index, "-", excel_name)
+
+
 def show_data_columns(columns_name: pd.Index, columns_index: Optional[List] = None) -> None:
     """Show the column names of the data set.
 
diff --git a/geochemistrypi/data_mining/utils/toggle_address_status.py b/geochemistrypi/data_mining/utils/toggle_address_status.py
new file mode 100644
index 00000000..4d968725
--- /dev/null
+++ b/geochemistrypi/data_mining/utils/toggle_address_status.py
@@ -0,0 +1,74 @@
+import os
+
+
+def list_excel_files(directory: str) -> list:
+    """Recursively list all data files (.xlsx, .xls and .csv) below the given directory.
+
+    Parameters
+    ----------
+    directory : str
+        The path to the directory to search for data files.
+
+    Returns
+    -------
+    excel_files : list
+        The paths of all matching files, each joined with the directory it was found in.
+
+    Notes
+    -----
+    (1) The function uses `os.walk` to traverse the directory and its subdirectories.
+    (2) Only file names ending in .xlsx, .xls or .csv are collected; the check is case-sensitive.
+    (3) If the directory does not exist, `os.walk` yields nothing and an empty list is returned.
+    """
+    data_file_suffixes = (".xlsx", ".xls", ".csv")
+    excel_files = []
+    for root, _, files in os.walk(directory):
+        # `str.endswith` accepts a tuple, so one call covers every supported suffix
+        excel_files.extend(os.path.join(root, file) for file in files if file.endswith(data_file_suffixes))
+    return excel_files
+
+
+def toggle_address_status(status: str = None, training_data_path: str = None) -> list:
+    """Select the training data path and the working (output) path for the given mode.
+
+    Parameters
+    ----------
+    status : str or int, optional
+        The mode, given as 1 or 2 (an int or a numeric string; it is passed through `int`).
+        - 1: Standard (command line) mode. `training_data_path` is returned unchanged and
+          the working path is the current working directory.
+        - 2: APP mode. All data files found under the "data" folder on the desktop become
+          the training data path, and the working path is the desktop.
+    training_data_path : str, optional
+        The path to the training data. Only used when `status` is 1.
+
+    Returns
+    -------
+    paths : list
+        A two-element list `[training_data_path, working_path]`. In mode 2 the first
+        element is a list of file paths instead of a single path.
+
+    Raises
+    ------
+    ValueError
+        If `status` is missing or is not 1 or 2.
+
+    """
+
+    try:
+        mode = int(status)
+    except (TypeError, ValueError):
+        # Covers the default None as well as non-numeric strings
+        mode = None
+
+    if mode == 1:
+        # Outputs belong in the directory the user started the application from
+        working_path = os.getcwd()
+    elif mode == 2:
+        desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
+        training_data_path = list_excel_files(os.path.join(desktop_path, "data"))
+        working_path = desktop_path
+    else:
+        raise ValueError("Invalid status value. It should be '1' or '2'.")
+
+    return [training_data_path, working_path]