From 86f30fb32081ae03166e515210b9cfb9e2952e91 Mon Sep 17 00:00:00 2001
From: Łukasz Zalewski
Date: Tue, 28 Sep 2021 20:07:40 +0200
Subject: [PATCH] Release/1.1 (#174)

* introduce different running modes: default, debug, experiment
* fix pytorch installation in setup_conda.sh
* fix incorrect calculation of precision, recall and f1 score in wandb callback
* add `_self_` to config.yaml for compatibility with hydra 1.1
* fix setting seed in `train.py` so it's skipped when `seed=null`
* add exception message when trying to use wandb callbacks with `trainer.fast_dev_run=true`
* change `axis=-1` to `dim=-1` in LogImagePredictions callback
* add 'Reproducibility' section to README.md
---
 README.md                                | 61 +++++++++++++-----------
 bash/setup_conda.sh                      | 37 ++++++++------
 configs/config.yaml                      | 19 ++++----
 configs/experiment/example_simple.yaml   |  2 +-
 configs/hparams_search/mnist_optuna.yaml |  1 -
 configs/hydra/default.yaml               | 12 -----
 configs/logger/comet.yaml                |  4 +-
 configs/logger/csv.yaml                  |  2 +-
 configs/logger/many_loggers.yaml         |  1 -
 configs/logger/mlflow.yaml               |  2 +-
 configs/logger/neptune.yaml              |  2 +-
 configs/logger/tensorboard.yaml          |  2 +-
 configs/logger/wandb.yaml                |  2 +-
 configs/mode/debug.yaml                  | 15 ++++++
 configs/mode/default.yaml                | 11 +++++
 configs/mode/exp.yaml                    | 15 ++++++
 src/callbacks/wandb_callbacks.py         | 13 +++--
 src/train.py                             |  2 +-
 src/utils/utils.py                       |  9 +++-
 19 files changed, 131 insertions(+), 81 deletions(-)
 delete mode 100644 configs/hydra/default.yaml
 create mode 100644 configs/mode/debug.yaml
 create mode 100644 configs/mode/default.yaml
 create mode 100644 configs/mode/exp.yaml

diff --git a/README.md b/README.md
index cbdc7a3c7..1fbd371a4 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ The directory structure of new project looks like this:
 │   ├── datamodule               <- Datamodule configs
 │   ├── experiment               <- Experiment configs
 │   ├── hparams_search           <- Hyperparameter search configs
-│   ├── hydra                    <- Hydra related configs
+│   ├── mode                     <- Running mode configs
 │   ├── logger                   <- Logger configs
 │   ├── model                    <- Model configs
 │   ├── trainer                  <- Trainer configs
@@ -150,7 +150,6 @@ python run.py trainer.max_epochs=20 model.lr=1e-4
 > You can also add new parameters with `+` sign.
 ```yaml
 python run.py +model.new_param="uwu"
-
 ```
 
@@ -217,6 +216,21 @@ python run.py logger=wandb
 
+
+Use different logging modes
+
+```yaml
+# debug mode changes logging folder to `logs/debug/`
+python run.py mode=debug
+
+# experiment mode changes logging folder to `logs/experiments/name_of_your_experiment/`
+# also sets a custom experiment name in the logger
+python run.py mode=exp name='my_new_experiment_253'
+```
+
+
+
+
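The `configs/mode/*.yaml` files this patch creates (see the diffstat above) are not shown in the hunks. A plausible sketch of what `configs/mode/debug.yaml` might contain, assuming it is a `_global_`-package Hydra config that redirects the output directory — every key and value below is an assumption, not taken from the patch:

```yaml
# @package _global_

# hypothetical sketch: send all debug runs to `logs/debug/`
hydra:
  run:
    dir: logs/debug/${now:%Y-%m-%d_%H-%M-%S}
  sweep:
    dir: logs/debug/multirun_${now:%Y-%m-%d_%H-%M-%S}
    subdir: ${hydra.job.num}
```

Selecting `mode=debug` on the command line would then swap this file into the composed config, which is how the logging folder changes without touching any other option.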
 Train model with chosen experiment config
@@ -269,7 +283,7 @@ python run.py +trainer.max_time="00:12:00:00"
 ```yaml
 # run 1 train, val and test loop, using only 1 batch
-python run.py debug=true
+python run.py trainer.fast_dev_run=true
 
 # print full weight summary of all PyTorch modules
 python run.py trainer.weights_summary="full"
@@ -348,12 +362,12 @@ python run.py -m 'experiment=glob(*)'
-
+
@@ -433,7 +447,7 @@ defaults:
   - callbacks: default.yaml # set this to null if you don't want to use callbacks
   - logger: null # set logger here or use command line (e.g. `python run.py logger=wandb`)
-  - hydra: default.yaml
+  - mode: default.yaml
   - experiment: null
   - hparams_search: null
@@ -598,13 +612,13 @@ By default, logs have the following structure:
 │
 ```
 
-You can change this structure by modifying paths in [hydra configuration](configs/hydra/default.yaml).
+You can change this structure by modifying paths in the [mode configs](configs/mode).
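The commit message also mentions adding `_self_` to `config.yaml` for Hydra 1.1 compatibility, although that hunk is not shown here. In Hydra 1.1 the primary config must state explicitly where its own keys are merged relative to the defaults list; a minimal sketch (the group names besides `mode` are illustrative, not copied from the patch):

```yaml
defaults:
  - _self_              # Hydra 1.1: this file's own keys are merged at this position
  - trainer: default.yaml
  - mode: default.yaml
```

Listing `_self_` first lets the selected groups override keys defined directly in `config.yaml`; moving it last reverses that precedence.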

 ### Experiment Tracking
 
 PyTorch Lightning supports the most popular logging frameworks:
-**[Weights&Biases](https://www.wandb.com/) · [Neptune](https://neptune.ai/) · [Comet](https://www.comet.ml/) · [MLFlow](https://mlflow.org) · [Aim](https://github.com/aimhubio/aim) · [Tensorboard](https://www.tensorflow.org/tensorboard/)**
+**[Weights&Biases](https://www.wandb.com/) · [Neptune](https://neptune.ai/) · [Comet](https://www.comet.ml/) · [MLFlow](https://mlflow.org) · [Tensorboard](https://www.tensorflow.org/tensorboard/)**
 
 These tools help you keep track of hyperparameters and output metrics and allow you to compare and visualize results. To use one of them simply complete its configuration in [configs/logger](configs/logger) and run:
 ```yaml
@@ -684,7 +698,7 @@ hydra:
Next, you can execute it with: `python run.py -m hparams_search=mnist_optuna`
-Using this approach doesn't require you to add any boilerplate into your pipeline, everything is defined in a single config file. You can use different optimization frameworks integrated with Hydra, like Optuna, Ax or Nevergrad.
+Using this approach doesn't require you to add any boilerplate to your pipeline; everything is defined in a single config file. You can use different optimization frameworks integrated with Hydra, like Optuna, Ax or Nevergrad. The `optimization_results.yaml` file will be available under the `logs/multirun` folder.
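The `mnist_optuna.yaml` config is only named, never shown, in this patch. A minimal sketch of such a search config, assuming the `hydra-optuna-sweeper` plugin and an illustrative `model.lr` parameter (all keys below are assumptions):

```yaml
# @package _global_
defaults:
  - override /hydra/sweeper: optuna

hydra:
  sweeper:
    direction: maximize   # maximize the metric returned by the training function
    n_trials: 20
    search_space:
      model.lr:
        type: float
        low: 0.0001
        high: 0.1
        log: true         # sample the learning rate on a log scale
```

Each of the 20 multirun jobs then receives a sampled `model.lr` override, exactly as if it had been passed on the command line.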

@@ -801,23 +815,14 @@ python run.py trainer.gpus=4 +trainer.accelerator="ddp"

-### Extra Features
-List of extra utilities available in the template:
-- loading environment variables from [.env](.env.example) file
-- pretty printing config with [Rich](https://github.com/willmcgugan/rich) library
-- disabling python warnings
-- debug mode
-
-
-You can easily remove any of those by modifying [run.py](run.py) and [src/train.py](src/train.py).
+### Reproducibility
+To reproduce a previous experiment, simply load its config from the logs:
+```yaml
+python run.py --config-path /logs/runs/.../.hydra/ --config-name config.yaml
+```
+The `config.yaml` from the `.hydra` folder contains all overridden parameters and sections.
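Reproducibility also depends on the seed fix listed in the commit message ("fix setting seed in `train.py` so it's skipped when `seed=null`"). A hedged Python sketch of that guard — `maybe_seed_everything` is a hypothetical stand-in for the template's actual call to Lightning's `seed_everything`, with plain `random` used here so the snippet is self-contained:

```python
import random
from typing import Optional


def maybe_seed_everything(seed: Optional[int]) -> Optional[int]:
    """Seed the RNG, but skip entirely when `seed=null` resolves to None.

    Hypothetical sketch: the real template seeds torch/numpy via
    pytorch_lightning's seed_everything; `random` stands in here.
    """
    if seed is None:
        return None  # leave the RNG state untouched
    random.seed(seed)
    return seed
```

Re-running with the same non-null seed then reproduces the same random draws, while `seed=null` leaves the RNG state as it was.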

-
-
 ## Best Practices