docs: add commands to operate the software on Jupyter Notebook and Google Colab.
SanyHe committed Jun 25, 2023
1 parent 0f15507 commit 1e5b815
Showing 3 changed files with 47 additions and 21 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -59,3 +59,6 @@ docs/build

# env
.env

# mlflow
mlruns
30 changes: 24 additions & 6 deletions README.md
@@ -29,35 +29,53 @@ Latest Update: follow up by clicking `Starred` and `Watch` on our [GitHub repos

## Quick Installation

One instruction to download on command line, such as Terminal on macOS, CMD on Windows.
One instruction to download on the **command line**, such as Terminal on macOS or CMD on Windows.
```
pip install geochemistrypi
```
One instruction to download on **Jupyter Notebook** or **Google Colab**.
```
!pip install geochemistrypi
```
**Note**: The beta version runs on macOS, Windows, or Linux.

## Quick Update
One instruction to update the software to the latest version on command line, such as Terminal on macOS, CMD on Windows.
One instruction to update the software to the latest version on the **command line**, such as Terminal on macOS or CMD on Windows.
```
pip install --upgrade geochemistrypi
```
One instruction to update the software to the latest version on **Jupyter Notebook** or **Google Colab**.
```
!pip install --upgrade geochemistrypi
```

## Example

**How to run:** After successfully downloading, run this instruction on command line whatever directory it is.
**How to run:** After successfully downloading, run this instruction on the **command line / Jupyter Notebook / Google Colab** from any directory.

### Case 1: Run with built-in data set for testing
On command line:
```
geochemistrypi data-mining
```
On Jupyter Notebook / Google Colab:
```
!geochemistrypi data-mining
```
**Note**: There are four built-in data sets corresponding to the four kinds of model patterns.

### Case 2: Run with your own data set
On command line:
```
geochemistrypi data-mining --data your_own_data_set.xlsx
```
**Note**: Currently, only `.xlsx` file is supported. Please specify the path your data file exists.
On Jupyter Notebook / Google Colab:
```
!geochemistrypi data-mining --data your_own_data_set.xlsx
```
**Note**: Currently, only `.xlsx` files are supported. Please specify the path where your data file exists. For Google Colab, don't forget to upload your dataset first.
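On Google Colab, the data set must be uploaded to the notebook runtime before the command above can find it. A minimal sketch using Colab's built-in file-upload helper (the file name shown is only an example):
```
# Run this in a Colab cell before invoking geochemistrypi.
from google.colab import files

uploaded = files.upload()  # opens a file picker and copies the chosen file into the runtime
print(list(uploaded.keys()))  # e.g. ['your_own_data_set.xlsx']
```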

For more details: Please refer to
For more details, please refer to:
+ [Manual v1.1.0 for Geochemistry π - Beta (International - Google drive)](https://drive.google.com/file/d/1yryykCyWKM-Sj88fOYbOba6QkB_fu2ws/view?usp=sharing)
+ [Manual v1.1.0 for Geochemistry π - Beta (China - Tencent Docs)](https://docs.qq.com/pdf/DQ0l5d2xVd2VwcnVW?&u=6868f96d4a384b309036e04e637e367a)
+ [Geochemistry π - Download and Run the Beta Version (International - Youtube)](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9)
@@ -113,7 +131,7 @@ The whole package is under construction and the documentation is progressively e
+ Bailun Jiang (EPSI / Lille University, France)
+ Yucheng Yan (Andy, University of Sydney)
+ Keran Li (Kirk, Chengdu University of Technology)
+ Mengying Ye (Jilin University, Changchun, China)
+ Mengying Ye (Mary, Jilin University, Changchun, China)


## Join Us :)
35 changes: 20 additions & 15 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -51,16 +51,18 @@ def cli_pipeline(file_name: str) -> None:
logger = log(OUTPUT_PATH, "inner_test.log")
logger.info("Geochemistry Py v.1.0.0 - beta version")

# If the argument is False, hide all Python level warnings.
# If the argument is False, hide all Python level warnings. Developers can turn it on by setting the argument to True.
show_warning(False)

# Read the data
logger.debug("Data Uploaded")
print("-*-*- Data Loading -*-*-")
# Data Loading
logger.debug("User Data Uploaded")
print("-*-*- User Data Loading -*-*-")
if file_name:
# If the user provides the file name, then load the data from the file.
data = read_data(file_name=file_name, is_own_data=1)
print(f"Successfully load the data set '{file_name}'.")
else:
# If the user doesn't provide the file name, then load the built-in data set.
print("Built-in Data Option:")
num2option(TEST_DATA_OPTION)
test_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input)
@@ -77,11 +79,12 @@ def cli_pipeline(file_name: str) -> None:
show_data_columns(data.columns)
clear_output()

# World map projection for a specific element
# World Map Projection (Optional)
logger.debug("World Map Projection")
print("-*-*- World Map -*-*-")
print("-*-*- World Map Projection -*-*-")
map_flag = 0
is_map_projection = 0
# TODO: Abstract the following code of checking the existence of the longitude and latitude columns into a function.
detection_index = 0
lon = ["LONGITUDE", "Longitude (°E)", "longitude", "Longitude", "经度 (°N)", "经度"]
lat = ["LATITUDE", "Latitude (°N)", "latitude", "Latitude", "纬度 (°E)", "纬度"]
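The TODO above suggests abstracting the longitude/latitude column check into a helper. A minimal sketch of what such a helper could look like, assuming `data` is a pandas DataFrame and reusing the alias lists shown in the diff (the function name is hypothetical, not the project's actual implementation):
```
import pandas as pd


def has_geo_coordinates(data: pd.DataFrame, lon_aliases: list, lat_aliases: list) -> bool:
    """Return True when the data set contains both a longitude and a latitude column."""
    has_lon = any(alias in data.columns for alias in lon_aliases)
    has_lat = any(alias in data.columns for alias in lat_aliases)
    return has_lon and has_lat
```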
@@ -100,14 +103,16 @@ def cli_pipeline(file_name: str) -> None:
if detection_index != 3:
print("Hence, world map projection functionality will be skipped!")
clear_output()
# If the data set contains both longitude and latitude data, then the user can choose to project the data on the world map.
while detection_index == 3:
if map_flag != 1:
# option selection
# Check if the user wants to project the data on the world map.
print("World Map Projection for A Specific Element Option:")
num2option(OPTION)
is_map_projection = limit_num_input(OPTION, SECTION[3], num_input)
clear_output()
if is_map_projection == 1:
# If the user chooses to project the data on the world map, then the user can select the element to be projected.
print("-*-*- Distribution in World Map -*-*-")
print("Select one of the elements below to be projected in the World Map: ")
show_data_columns(data.columns)
@@ -131,7 +136,7 @@ def cli_pipeline(file_name: str) -> None:
elif is_map_projection == 2:
break

# Create the processing data set
# Data Selection for Preprocessing
logger.debug("Data Selection")
print("-*-*- Data Selection -*-*-")
show_data_columns(data.columns)
@@ -148,7 +153,7 @@ def cli_pipeline(file_name: str) -> None:
logged_distribution_plot(data_processed.columns, data_processed)
clear_output()

# Imputing
# Imputing Missing Values
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
is_null_value(data_processed)
@@ -189,7 +194,7 @@ def cli_pipeline(file_name: str) -> None:
# if the selected data set doesn't need imputation, which means there are no missing values.
data_processed_imputed = data_processed

# Feature engineering
# Feature Engineering
# FIXME(hecan [email protected]): fix the logic
logger.debug("Feature Engineering")
print("-*-*- Feature Engineering -*-*-")
@@ -233,16 +238,16 @@ def cli_pipeline(file_name: str) -> None:
clear_output()
break

# Mode selection
# Mode Selection
logger.debug("Mode Selection")
print("-*-*- Mode Options -*-*-")
print("-*-*- Mode Selection -*-*-")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
clear_output()
# divide X and y data set when it is supervised learning
logger.debug("Data Split")
if mode_num == 1 or mode_num == 2:
print("-*-*- Data Split - X Set and Y Set-*-*-")
print("-*-*- Data Split - X Set and Y Set -*-*-")
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
print("Selected sub data set to create X data set:")
@@ -257,7 +262,7 @@ def cli_pipeline(file_name: str) -> None:
save_data(X, "X Without Scaling", DATASET_OUTPUT_PATH)
clear_output()

# feature scaling
# Feature Scaling
print("-*-*- Feature Scaling on X Set -*-*-")
num2option(OPTION)
is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input)
@@ -313,7 +318,7 @@ def cli_pipeline(file_name: str) -> None:
X_train = data_processed_imputed
y, X_test, y_train, y_test = None, None, None, None

# Model option for users
# Model Selection
logger.debug("Model Selection")
print("-*-*- Model Selection -*-*-:")
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
