From c88add7bb06f58085b7b72368dff212acf74a6b2 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Fri, 19 Nov 2021 10:15:24 +0100 Subject: [PATCH 01/56] trying to fix some bugs when selecting part of the action space --- examples/restrict_actions.py | 64 +++++++++++++++++++++++++ l2rpn_baselines/DeepQSimple/train.py | 1 + l2rpn_baselines/DuelQSimple/DuelQ_NN.py | 2 + l2rpn_baselines/DuelQSimple/train.py | 1 + l2rpn_baselines/utils/DeepQAgent.py | 2 +- 5 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 examples/restrict_actions.py diff --git a/examples/restrict_actions.py b/examples/restrict_actions.py new file mode 100644 index 0000000..70b5da5 --- /dev/null +++ b/examples/restrict_actions.py @@ -0,0 +1,64 @@ +import shutil +import numpy as np +from l2rpn_baselines.DuelQSimple import train +from l2rpn_baselines.utils import NNParam, TrainingParam +from grid2op import make + +def filter_action_fun(grid2op_act): + # filter out all non redispatching actions + if np.any(grid2op_act.set_bus != 0): + return False + if np.any(grid2op_act.change_bus): + return False + if np.any(grid2op_act.curtail != -1.): + return False + if np.any(grid2op_act.storage_p != 0): + return False + if np.any(grid2op_act.line_set_status != 0): + return False + if np.any(grid2op_act.line_change_status): + return False + # it should be a redispatching action + return True + +if __name__ == "__main__": + + train_iter = 1000 + env_name = "l2rpn_case14_sandbox" + + env = make(env_name) + + + agent_name = "test_agent" + save_path = "saved_agent_DDDQN_{}".format(train_iter) + shutil.rmtree(save_path, ignore_errors=True) + logs_dir="tf_logs_DDDQN" + + li_attr_obs_X = ["gen_p", "gen_v", "load_p", "load_q"] + + observation_size = NNParam.get_obs_size(env, li_attr_obs_X) + + sizes = [300, 300, 300] # 3 hidden layers, of 300 units each, why not... + activs = ["relu" for _ in sizes] # all followed by relu activation, because... why not + + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes, + 'activs': activs, + "list_attr_obs": li_attr_obs_X} + + # baselines.readthedocs.io/en/latest/utils.html#l2rpn_baselines.utils.TrainingParam + tp = TrainingParam() + tp.batch_size = 32 # for example... 
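+    # the frequencies below are simply derived from train_iter, so the example
+    # keeps a consistent schedule if the number of training iterations changes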
+ tp.update_tensorboard_freq = int(train_iter / 10) + tp.save_model_each = int(train_iter / 3) + tp.min_observation = int(train_iter / 5) + train(env, + name=agent_name, + iterations=train_iter, + save_path=save_path, + load_path=None, # put something else if you want to reload an agent instead of creating a new one + logs_dir=logs_dir, + kwargs_archi=kwargs_archi, + training_param=tp, + filter_action_fun=filter_action_fun) + diff --git a/l2rpn_baselines/DeepQSimple/train.py b/l2rpn_baselines/DeepQSimple/train.py index c1020f4..5e2a322 100755 --- a/l2rpn_baselines/DeepQSimple/train.py +++ b/l2rpn_baselines/DeepQSimple/train.py @@ -176,6 +176,7 @@ def train(env, name=name, istraining=True, verbose=verbose, + filter_action_fun=filter_action_fun, **kwargs_converters ) diff --git a/l2rpn_baselines/DuelQSimple/DuelQ_NN.py b/l2rpn_baselines/DuelQSimple/DuelQ_NN.py index a3c85d1..73a2f06 100644 --- a/l2rpn_baselines/DuelQSimple/DuelQ_NN.py +++ b/l2rpn_baselines/DuelQSimple/DuelQ_NN.py @@ -29,6 +29,8 @@ def __init__(self, BaseDeepQ.__init__(self, nn_params, training_param) + if self._action_size == 0: + raise RuntimeError("Impossible to make a DeepQ network with an action space of size 0!") self.construct_q_network() def construct_q_network(self): diff --git a/l2rpn_baselines/DuelQSimple/train.py b/l2rpn_baselines/DuelQSimple/train.py index 0b4d6ae..369dea3 100755 --- a/l2rpn_baselines/DuelQSimple/train.py +++ b/l2rpn_baselines/DuelQSimple/train.py @@ -177,6 +177,7 @@ def train(env, name=name, istraining=True, verbose=verbose, + filter_action_fun=filter_action_fun, **kwargs_converters ) diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index 06f9b39..dd766d5 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -185,7 +185,7 @@ def __init__(self, # for the frequency of action type self.current_ = 0 self.nb_ = 10 - self._nb_this_time = np.zeros((self.nb_, 7)) + self._nb_this_time = np.zeros((self.nb_, 7), dtype=int) # self._vector_size = None From 493d324e98196de84bc73a9ca16b909e2af86daa Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Fri, 21 Jan 2022 14:45:16 +0100 Subject: [PATCH 02/56] adding new types of actions in tensorboard --- l2rpn_baselines/utils/DeepQAgent.py | 36 +++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index dd766d5..9f75d96 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -100,6 +100,11 @@ class DeepQAgent(AgentWithConverter): Number of action tagged as "storage". See the `official grid2op documentation `_ for more information. + + nb_curtail: ``int`` + Number of action tagged as "curtailment". See the + `official grid2op documentation `_ + for more information. nb_do_nothing: ``int`` Number of action tagged as "do_nothing", *ie* when an action is not modifiying the state of the grid. 
See the @@ -157,6 +162,7 @@ def __init__(self, self.nb_topology = 0 self.nb_line = 0 self.nb_redispatching = 0 + self.nb_curtail = 0 self.nb_storage = 0 self.nb_do_nothing = 0 @@ -185,7 +191,7 @@ def __init__(self, # for the frequency of action type self.current_ = 0 self.nb_ = 10 - self._nb_this_time = np.zeros((self.nb_, 7), dtype=int) + self._nb_this_time = np.zeros((self.nb_, 8), dtype=int) # self._vector_size = None @@ -471,6 +477,7 @@ def train(self, self.nb_topology = 0 self.nb_line = 0 self.nb_redispatching = 0 + self.nb_curtail = 0 self.nb_storage = 0 self.nb_do_nothing = 0 @@ -569,23 +576,27 @@ def _create_action_if_not_registered(self, action_int): """make sure that `action_int` is present in dict_action""" if action_int not in self.dict_action: act = self.action_space.all_actions[action_int] - is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn = \ - False, False, False, False, False, False, False + is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn, is_curtail = \ + False, False, False, False, False, False, False, False try: # feature unavailble in grid2op <= 0.9.2 try: # storage introduced in grid2op 1.5.0 so if below it is not supported is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types() except ValueError as exc_: - is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage = act.get_types() + try: + is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage = act.get_types() + except ValueError as exc_: + is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_curtail = act.get_types() is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp) is_dn = is_dn and (not is_storage) + is_dn = is_dn and (not is_curtail) except Exception as exc_: pass self.dict_action[action_int] = [0, act, - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn)] + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_curtail, is_dn)] def _store_action_played(self, action_int): """if activated, this function will store the action taken by the agent.""" @@ -593,7 +604,7 @@ def _store_action_played(self, action_int): self._create_action_if_not_registered(action_int) self.dict_action[action_int][0] += 1 - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn) = self.dict_action[action_int][2] + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_curtail, is_dn) = self.dict_action[action_int][2] if is_inj: self.nb_injection += 1 if is_volt: @@ -606,6 +617,9 @@ def _store_action_played(self, action_int): self.nb_redispatching += 1 if is_storage: self.nb_storage += 1 + self.nb_redispatching += 1 + if is_curtail: + self.nb_curtail += 1 if is_dn: self.nb_do_nothing += 1 @@ -995,13 +1009,14 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): self.current_ += 1 self.current_ %= self.nb_ - nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_storage, nb_dn = self._nb_this_time[self.current_, :] + nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_storage, nb_curtail, nb_dn = self._nb_this_time[self.current_, :] self._nb_this_time[self.current_, :] = [self.nb_injection, self.nb_voltage, self.nb_topology, self.nb_line, self.nb_redispatching, self.nb_storage, + self.nb_curtail, self.nb_do_nothing] curr_inj = self.nb_injection - nb_inj @@ -1010,6 +1025,7 @@ def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): curr_line = 
self.nb_line - nb_line curr_redisp = self.nb_redispatching - nb_redisp curr_storage = self.nb_storage - nb_storage + curr_curtail = self.nb_curtail - nb_curtail curr_dn = self.nb_do_nothing - nb_dn total_act_num = curr_inj + curr_volt + curr_topo + curr_line + curr_redisp + curr_dn + curr_storage @@ -1055,3 +1071,9 @@ def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): description="Frequency of \"storage\" actions " "type played over the last {} actions" "".format(self.nb_ * UPDATE_FREQ)) + tf.summary.scalar("z_freq_curtail", + curr_curtail / total_act_num, + step_tb, + description="Frequency of \"curtailment\" actions " + "type played over the last {} actions" + "".format(self.nb_ * UPDATE_FREQ)) \ No newline at end of file From 0eb2a17c30cd1bb99ad9b0873798a1115a7800b4 Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Mon, 24 Jan 2022 10:17:25 +0100 Subject: [PATCH 03/56] adding some files to prepare examples for stable baselines and ray and other frameworks --- l2rpn_baselines/DeepQSimple/evaluate.py | 6 +- l2rpn_baselines/DuelQLeapNet/evaluate.py | 6 +- l2rpn_baselines/DuelQSimple/evaluate.py | 11 +++- l2rpn_baselines/Kaist | 2 +- l2rpn_baselines/LeapNetEncoded/evaluate.py | 8 ++- l2rpn_baselines/utils/DeepQAgent.py | 20 +++---- l2rpn_baselines/utils/gymAgent.py | 26 +++++++++ test_Eva.py | 65 ++++++++++++++++++++++ test_eva_dn.py | 23 ++++++++ test_stable_baselines.py | 54 ++++++++++++++++++ 10 files changed, 202 insertions(+), 19 deletions(-) create mode 100644 l2rpn_baselines/utils/gymAgent.py create mode 100644 test_Eva.py create mode 100644 test_eva_dn.py create mode 100644 test_stable_baselines.py diff --git a/l2rpn_baselines/DeepQSimple/evaluate.py b/l2rpn_baselines/DeepQSimple/evaluate.py index 4a10f7b..a8a90fd 100644 --- a/l2rpn_baselines/DeepQSimple/evaluate.py +++ b/l2rpn_baselines/DeepQSimple/evaluate.py @@ -36,7 +36,8 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, verbose=False, - save_gif=False): + save_gif=False, + filter_action_fun=None): """ How to evaluate the performances of the trained DeepQSimple agent. @@ -128,7 +129,8 @@ def evaluate(env, name=name, store_action=nb_process == 1, nn_archi=nn_archi, - observation_space=env.observation_space) + observation_space=env.observation_space, + filter_action_fun=filter_action_fun) # Load weights from file agent.load(load_path) diff --git a/l2rpn_baselines/DuelQLeapNet/evaluate.py b/l2rpn_baselines/DuelQLeapNet/evaluate.py index 54da8cb..d2d7c22 100644 --- a/l2rpn_baselines/DuelQLeapNet/evaluate.py +++ b/l2rpn_baselines/DuelQLeapNet/evaluate.py @@ -38,7 +38,8 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, verbose=False, - save_gif=False): + save_gif=False, + filter_action_fun=None): """ How to evaluate the performances of the trained DeepQSimple agent. 
@@ -131,7 +132,8 @@ def evaluate(env, name=name, store_action=nb_process == 1, nn_archi=nn_archi, - observation_space=env.observation_space) + observation_space=env.observation_space, + filter_action_fun=filter_action_fun) # Load weights from file agent.load(load_path) diff --git a/l2rpn_baselines/DuelQSimple/evaluate.py b/l2rpn_baselines/DuelQSimple/evaluate.py index e539aaf..52acc9c 100644 --- a/l2rpn_baselines/DuelQSimple/evaluate.py +++ b/l2rpn_baselines/DuelQSimple/evaluate.py @@ -37,7 +37,8 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, verbose=False, - save_gif=False): + save_gif=False, + filter_action_fun=None): """ How to evaluate the performances of the trained DuelQSimple agent. @@ -72,6 +73,11 @@ def evaluate(env, Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might take a lot of ram) and drastically increase computation time. + filter_action_fun: ``function`` + A function to filter the action space. See + `IdToAct.filter_action `_ + documentation. + Returns ------- agent: :class:`l2rpn_baselines.utils.DeepQAgent` @@ -130,7 +136,8 @@ def evaluate(env, name=name, store_action=nb_process == 1, nn_archi=nn_archi, - observation_space=env.observation_space) + observation_space=env.observation_space, + filter_action_fun=filter_action_fun) # Load weights from file agent.load(load_path) diff --git a/l2rpn_baselines/Kaist b/l2rpn_baselines/Kaist index b2b6561..71c49e7 160000 --- a/l2rpn_baselines/Kaist +++ b/l2rpn_baselines/Kaist @@ -1 +1 @@ -Subproject commit b2b6561a2cc3afbf03fd13ef6d1b334e4ec6c98a +Subproject commit 71c49e73ace272fd6d8258a5295abc2b8d3bea1b diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py index 0d95e8f..fd45852 100644 --- a/l2rpn_baselines/LeapNetEncoded/evaluate.py +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -37,7 +37,8 @@ def evaluate(env, nb_process=DEFAULT_NB_PROCESS, max_steps=DEFAULT_MAX_STEPS, verbose=False, - save_gif=False): + save_gif=False, + filter_action_fun=None): """ How to evaluate the performances of the trained DeepQSimple agent. 
@@ -130,7 +131,8 @@ def evaluate(env, name=name, store_action=nb_process == 1, nn_archi=nn_archi, - observation_space=env.observation_space) + observation_space=env.observation_space, + filter_action_fun=filter_action_fun) # Load weights from file agent.load(load_path) @@ -208,3 +210,5 @@ def evaluate(env, max_steps=args.max_steps, verbose=args.verbose, save_gif=args.save_gif) + + diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index 9f75d96..5a78346 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -995,16 +995,16 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a if self.store_action: self._store_frequency_action_type(UPDATE_FREQ, step_tb) - if self._time_step_lived is not None: - tf.summary.histogram( - "timestep_lived", self._time_step_lived, step=step_tb, buckets=None, - description="Number of time steps lived for all scenarios" - ) - if self._nb_chosen is not None: - tf.summary.histogram( - "nb_chosen", self._nb_chosen, step=step_tb, buckets=None, - description="Number of times this scenarios has been played" - ) + # if self._time_step_lived is not None: + # tf.summary.histogram( + # "timestep_lived", self._time_step_lived, step=step_tb, buckets=None, + # description="Number of time steps lived for all scenarios" + # ) + # if self._nb_chosen is not None: + # tf.summary.histogram( + # "nb_chosen", self._nb_chosen, step=step_tb, buckets=None, + # description="Number of times this scenarios has been played" + # ) def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): self.current_ += 1 diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py new file mode 100644 index 0000000..c5b4ba2 --- /dev/null +++ b/l2rpn_baselines/utils/gymAgent.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +from grid2op.Agent import BaseAgent + + +class GymAgent(BaseAgent): + """ + This class maps a neural network (trained using ray / rllib or stable baselines for example + + It can then be used as a "regular" grid2op agent, in a runner, grid2viz, grid2game etc. + + It is also compatible with the "l2rpn baselines" interface. 
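+
+    It is built from the grid2op action space, the gym action / observation
+    spaces used at training time and the path where the trained neural
+    network has been saved.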
+ """ + def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path): + super().__init__(g2op_action_space) + self._gym_act_space = gym_act_space + self._gym_obs_space = gym_obs_space + self._nn_path = nn_path + + \ No newline at end of file diff --git a/test_Eva.py b/test_Eva.py new file mode 100644 index 0000000..4508cae --- /dev/null +++ b/test_Eva.py @@ -0,0 +1,65 @@ +import json +import os +import grid2op +import re +from grid2op.Reward import L2RPNReward, EpisodeDurationReward +from l2rpn_baselines.utils import TrainingParam, NNParam +from l2rpn_baselines.DuelQSimple import train +from lightsim2grid import LightSimBackend +from grid2op.Chronics import MultifolderWithCache + +# define the environment +env = grid2op.make("l2rpn_case14_sandbox", + reward_class=EpisodeDurationReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache) + +env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) +env.chronics_handler.real_data.reset() + +# use the default training parameters +tp = TrainingParam() + +# this will be the list of what part of the observation I want to keep +# more information on https://grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes +li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] + +# neural network architecture +observation_size = NNParam.get_obs_size(env, li_attr_obs_X) +sizes = [800, 494, 494] # sizes of each hidden layers +kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes, + 'activs': ["relu" for _ in sizes], # all relu activation function + "list_attr_obs": li_attr_obs_X} +li_act_path = "line_act.json" +if os.path.exists(li_act_path): + with open(li_act_path, "r", encoding="utf-8") as f: + all_acts = json.load(f) +else: + all_acts = [env.action_space().as_serializable_dict()] + for el in range(env.n_line): + all_acts.append(env.action_space({"set_line_status" : [(el, -1)]}).as_serializable_dict()) + all_acts.append(env.action_space({"set_line_status" : [(el, +1)]}).as_serializable_dict()) + + with open(li_act_path, "w", encoding="utf-8") as f: + json.dump(fp=f, obj=all_acts) + +# select some part of the action +# more information at https://grid2op.readthedocs.io/en/latest/converter.html#grid2op.Converter.IdToAct.init_converter +kwargs_converters = {"all_actions": all_acts } +# define the name of the model +nm_ = "AnneOnymous6" +try: + train(env, + name=nm_, + iterations=1_000_000, + save_path="./saved_agents", + load_path=None, + logs_dir="./logs", + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) +finally: + env.close() diff --git a/test_eva_dn.py b/test_eva_dn.py new file mode 100644 index 0000000..6c224d5 --- /dev/null +++ b/test_eva_dn.py @@ -0,0 +1,23 @@ +import pdb +import grid2op +import re +import os +from grid2op.Reward import L2RPNReward +from grid2op.Runner import Runner +from lightsim2grid import LightSimBackend +from grid2op.Chronics import MultifolderWithCache + +# define the environment +env = grid2op.make("l2rpn_case14_sandbox", + reward_class=L2RPNReward, + backend=LightSimBackend(), + # chronics_class=MultifolderWithCache + ) + +# env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) +# env.chronics_handler.real_data.reset() +runner = 
Runner(**env.get_params_for_runner()) +res = runner.run(nb_episode=10, episode_id=["0000", "0100", "0200", "0300", "0400", "0500", "0600", "0700", "0800", "0900"]) +pdb.set_trace() +{'0000': 1091, '0100': 1097, '0300': 1096, '0400': 2828, '0500': 514, '0600': 1091, '0700': 717, '0800': 513, '0900': 381} +# mean time survived: 1036.4444444444443 \ No newline at end of file diff --git a/test_stable_baselines.py b/test_stable_baselines.py new file mode 100644 index 0000000..b11d28c --- /dev/null +++ b/test_stable_baselines.py @@ -0,0 +1,54 @@ +import gym +import json +import os +import grid2op +import re +from grid2op.Reward import L2RPNReward, EpisodeDurationReward +from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace +from l2rpn_baselines.DuelQSimple import train +from lightsim2grid import LightSimBackend +from grid2op.Chronics import MultifolderWithCache +from stable_baselines3 import PPO + +# define the environment +env = grid2op.make("l2rpn_case14_sandbox", + reward_class=EpisodeDurationReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache) + +env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) +env.chronics_handler.real_data.reset() +env_gym = GymEnv(env) + +li_act_path = "line_act.json" +if os.path.exists(li_act_path): + with open(li_act_path, "r", encoding="utf-8") as f: + all_acts = json.load(f) +else: + all_acts = [env.action_space().as_serializable_dict()] + for el in range(env.n_line): + all_acts.append(env.action_space({"set_line_status" : [(el, -1)]}).as_serializable_dict()) + all_acts.append(env.action_space({"set_line_status" : [(el, +1)]}).as_serializable_dict()) + +env_gym.action_space = DiscreteActSpace(env.action_space, action_list= all_acts) + +li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] +env_gym.observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=li_attr_obs_X) + +# learn +model = PPO("MlpPolicy", env_gym, verbose=1, tensorboard_log="./logs") +model.learn(total_timesteps=100_000) + + +# test +obs = env_gym.reset() +for i in range(1000): + action, _states = model.predict(obs, deterministic=True) + obs, reward, done, info = env_gym.step(action) + if done: + print(f"{reward=}") + obs = env_gym.reset() + +env.close() \ No newline at end of file From 53110f876acb9b9d79790b28488698b4ac683b67 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 24 Jan 2022 17:00:12 +0100 Subject: [PATCH 04/56] adding a stable baseline based baseline, need to do tests and docs --- .gitignore | 3 + l2rpn_baselines/Kaist | 2 +- .../ppo_stablebaselines/__init__.py | 0 .../ppo_stablebaselines/evaluate.py | 128 ++++++++ l2rpn_baselines/ppo_stablebaselines/train.py | 285 ++++++++++++++++++ l2rpn_baselines/ppo_stablebaselines/utils.py | 27 ++ l2rpn_baselines/utils/__init__.py | 4 +- l2rpn_baselines/utils/gymAgent.py | 29 +- test_Eva.py | 65 ---- test_eva_dn.py | 23 -- test_stable_baselines.py | 54 ---- 11 files changed, 474 insertions(+), 146 deletions(-) create mode 100644 l2rpn_baselines/ppo_stablebaselines/__init__.py create mode 100644 l2rpn_baselines/ppo_stablebaselines/evaluate.py create mode 100644 l2rpn_baselines/ppo_stablebaselines/train.py create mode 100644 l2rpn_baselines/ppo_stablebaselines/utils.py delete mode 100644 test_Eva.py delete mode 100644 test_eva_dn.py delete mode 100644 
test_stable_baselines.py diff --git a/.gitignore b/.gitignore index 9c2c5ef..d096ca0 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,6 @@ l2rpn_baselines/LeapNetEncoded/tf_logs/ l2rpn_baselines/LeapNetEncoded/tf_logs_test/ l2rpn_baselines/LeapNetEncoded/model_test/ l2rpn_baselines/SACOld/trained_models/ +l2rpn_baselines/ppo_stablebaselines/logs +l2rpn_baselines/ppo_stablebaselines/saved_baseline +l2rpn_baselines/ppo_stablebaselines/saved_model \ No newline at end of file diff --git a/l2rpn_baselines/Kaist b/l2rpn_baselines/Kaist index 71c49e7..b2b6561 160000 --- a/l2rpn_baselines/Kaist +++ b/l2rpn_baselines/Kaist @@ -1 +1 @@ -Subproject commit 71c49e73ace272fd6d8258a5295abc2b8d3bea1b +Subproject commit b2b6561a2cc3afbf03fd13ef6d1b334e4ec6c98a diff --git a/l2rpn_baselines/ppo_stablebaselines/__init__.py b/l2rpn_baselines/ppo_stablebaselines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/l2rpn_baselines/ppo_stablebaselines/evaluate.py b/l2rpn_baselines/ppo_stablebaselines/evaluate.py new file mode 100644 index 0000000..62a313f --- /dev/null +++ b/l2rpn_baselines/ppo_stablebaselines/evaluate.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import json +from grid2op.Runner import Runner + +from l2rpn_baselines.utils.save_log_gif import save_log_gif + +from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace + +from l2rpn_baselines.ppo_stablebaselines.utils import SB3Agent + + +def evaluate(env, + load_path=".", + name="ppo_stable_baselines", + logs_path=None, + nb_episode=1, + nb_process=1, + max_steps=-1, + verbose=False, + save_gif=False, + **kwargs): + + # load the attributes kept + my_path = os.path.join(load_path, name) + if not os.path.exists(load_path): + os.mkdir(load_path) + if not os.path.exists(my_path): + os.mkdir(my_path) + + with open(os.path.join(my_path, "obs_attr_to_keep.json"), encoding="utf-8", mode="r") as f: + obs_attr_to_keep = json.load(fp=f) + with open(os.path.join(my_path, "act_attr_to_keep.json"), encoding="utf-8", mode="r") as f: + act_attr_to_keep = json.load(fp=f) + + # create the action and observation space + gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) + gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + # create a grid2gop agent based on that (this will reload the save weights) + full_path = os.path.join(load_path, name) + grid2op_agent = SB3Agent(env.action_space, gym_action_space, gym_observation_space, + nn_path=os.path.join(full_path, name)) + + # Build runner + runner_params = env.get_params_for_runner() + runner_params["verbose"] = verbose + runner = Runner(**runner_params, + agentClass=None, + agentInstance=grid2op_agent) + + # Run the agent on the scenarios + if logs_path is not None: + os.makedirs(logs_path, exist_ok=True) + + res = runner.run(path_save=logs_path, + nb_episode=nb_episode, + nb_process=nb_process, + max_iter=max_steps, + pbar=verbose, + **kwargs) + + # Print summary + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + 
msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + + if save_gif: + if verbose: + print("Saving the gif of the episodes") + save_log_gif(logs_path, res) + return grid2op_agent, res + + +if __name__ == "__main__": + import grid2op + from grid2op.Action import CompleteAction + from grid2op.Reward import L2RPNReward, EpisodeDurationReward, LinesCapacityReward + from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace + from lightsim2grid import LightSimBackend + from grid2op.Chronics import MultifolderWithCache + import pdb + + nb_episode = 7 + nb_process = 1 + verbose = True + + env = grid2op.make("educ_case14_storage", + test=True, + action_class=CompleteAction, + reward_class=LinesCapacityReward, + backend=LightSimBackend()) + + evaluate(env, + nb_episode=nb_episode, + load_path="./saved_model", + name="test4", + nb_process=1, + verbose=verbose, + ) + + # to compare with do nothing + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) \ No newline at end of file diff --git a/l2rpn_baselines/ppo_stablebaselines/train.py b/l2rpn_baselines/ppo_stablebaselines/train.py new file mode 100644 index 0000000..0fc8c54 --- /dev/null +++ b/l2rpn_baselines/ppo_stablebaselines/train.py @@ -0,0 +1,285 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import pdb +import warnings +import copy +import os +import grid2op +from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv + +import json + +from stable_baselines3.common.callbacks import CheckpointCallback +from stable_baselines3 import PPO +from stable_baselines3.ppo import MlpPolicy + +_default_obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + +_default_act_attr_to_keep = ["redispatch", "curtail", "set_storage"] + + +def train(env, + name="ppo_stable_baselines", + iterations=1, + save_path=None, + load_path=None, + net_arch=None, + logs_dir=None, + learning_rate=3e-4, + save_every_xxx_steps=None, + model_policy=MlpPolicy, + obs_attr_to_keep=copy.deepcopy(_default_obs_attr_to_keep), + act_attr_to_keep=copy.deepcopy(_default_act_attr_to_keep), + **kwargs): + """ + This function will use stable baselines 3 to train a PPO agent on + a grid2op environment "env". + + It will use the grid2op "gym_compat" module to convert the action space + to a BoxActionSpace and the observation to a BoxObservationSpace. 
+ + It is suited for the studying the impact of continuous actions: + + - on storage units + - on dispatchable generators + - on generators with renewable energy sources + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + name: ``str``` + The name of your agent. + + iterations: ``int`` + For how many iterations (steps) do you want to train your agent. NB these are not episode, these are steps. + + save_path: ``str`` + Where do you want to save your baseline. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. + + net_arch: + The neural network architecture, used to create the neural network + of the PPO (see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html) + + logs_dir: ``str`` + Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. + + learning_rate: ``float`` + The learning rate, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html + + save_every_xxx_steps: ``int`` + If set (by default it's None) the stable baselines3 model will be saved + to the hard drive each `save_every_xxx_steps` steps performed in the + environment. + + model_policy: + Type of neural network model trained in stable baseline. By default + it's `MlpPolicy` + + obs_attr_to_keep: list of string + Grid2op attribute to use to build the BoxObservationSpace. It is passed + as the "attr_to_keep" value of the + BoxObservation space (see + https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymObsSpace) + + act_attr_to_keep: list of string + Grid2op attribute to use to build the BoxGymActSpace. It is passed + as the "attr_to_keep" value of the + BoxAction space (see + https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymActSpace) + + verbose: ``bool`` + If you want something to be printed on the terminal (a better logging strategy will be put at some point) + + kwargs: + extra parameters passed to the PPO from stable baselines 3 + + Returns + ------- + + baseline: + The trained baseline as a stable baselines PPO element. + + + .. _Example-ppo_stable_baseline: + + Examples + --------- + + Here is an example on how to train a ppo_stablebaseline . + + First define a python script, for example + + .. code-block:: python + + import re + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + from grid2op.Chronics import MultifolderWithCache # highly recommended + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache) + + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) + env.chronics_handler.real_data.reset() + # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline + # for more information ! 
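+        # the try / finally block makes sure the environment is properly
+        # closed even if the training raises an exception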
+ + try: + train(env, + iterations=10_000, # any number of iterations you want + logs_dir="./logs", # where the tensorboard logs will be put + save_path="./saved_model", # where the NN weights will be saved + name="test", # name of the baseline + net_arch=[100, 100, 100], # architecture of the NN + save_every_xxx_steps=2000, # save the NN every 2k steps + ) + finally: + env.close() + + """ + if act_attr_to_keep == _default_act_attr_to_keep: + # by default, i remove all the attributes that are not supported by the action type + # i do not do that if the user specified specific attributes to keep. This is his responsibility in + # in this case + modif_attr = [] + for el in act_attr_to_keep: + if env.action_space.supports_type(el): + modif_attr.append(el) + else: + warnings.warn(f"attribute {el} cannot be processed by the allowed " + "action type. It has been removed from the " + "gym space as well.") + act_attr_to_keep = modif_attr + + if save_path is not None: + # save the attributes kept + my_path = os.path.join(save_path, name) + if not os.path.exists(save_path): + os.mkdir(save_path) + if not os.path.exists(my_path): + os.mkdir(my_path) + + with open(os.path.join(my_path, "obs_attr_to_keep.json"), encoding="utf-8", mode="w") as f: + json.dump(fp=f, obj=obs_attr_to_keep) + with open(os.path.join(my_path, "act_attr_to_keep.json"), encoding="utf-8", mode="w") as f: + json.dump(fp=f, obj=act_attr_to_keep) + + # define the gym environment from the grid2op env + env_gym = GymEnv(env) + env_gym.observation_space.close() + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + + # Save a checkpoint every 1000 steps + checkpoint_callback = None + if save_every_xxx_steps is not None: + if save_path is None: + warnings.warn("save_every_xxx_steps is set, but no path are " + "set to save the model (save_path is None). No model " + "will be saved.") + else: + checkpoint_callback = CheckpointCallback(save_freq=save_every_xxx_steps, + save_path=my_path, + name_prefix=name) + + # define the policy + if load_path is None: + policy_kwargs = {} + if net_arch is not None: + policy_kwargs["net_arch"] = net_arch + if logs_dir is not None: + if not os.path.exists(logs_dir): + os.mkdir(logs_dir) + model = PPO(model_policy, + env_gym, + verbose=1, + learning_rate=learning_rate, + tensorboard_log=os.path.join(logs_dir, name), + policy_kwargs=policy_kwargs, + **kwargs) + else: + # TODO ! + model = PPO.load(os.path.join(load_path, name)) + + # train it + model.learn(total_timesteps=iterations, + callback=checkpoint_callback) + + # save it + if save_path is not None: + model.save(os.path.join(my_path, name)) + + env_gym.close() + +if __name__ == "__main__": + + import re + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + from grid2op.Chronics import MultifolderWithCache # highly recommended + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache) + + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) + env.chronics_handler.real_data.reset() + # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline + # for more information ! 
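+    # same settings as in the docstring example above: train a small PPO
+    # agent and save it under ./saved_model/test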
+ + train(env, + iterations=10_000, + logs_dir="./logs", + save_path="./saved_model", + name="test", + net_arch=[100, 100, 100], + save_every_xxx_steps=2000, + ) + + + # from grid2op.Action import CompleteAction + # from grid2op.Reward import LinesCapacityReward + # from lightsim2grid import LightSimBackend + # from grid2op.Chronics import MultifolderWithCache + + # env = grid2op.make("educ_case14_storage", + # test=True, + # action_class=CompleteAction, + # reward_class=LinesCapacityReward, + # backend=LightSimBackend(), + # chronics_class=MultifolderWithCache) + + # env.chronics_handler.real_data.set_filter(lambda x: True) + # env.chronics_handler.real_data.reset() + + # train(env, + # iterations=10_000, + # logs_dir="./logs", + # save_path="./saved_model", + # name="test4", + # net_arch=[100, 100, 100], + # save_every_xxx_steps=2000, + # ) diff --git a/l2rpn_baselines/ppo_stablebaselines/utils.py b/l2rpn_baselines/ppo_stablebaselines/utils.py new file mode 100644 index 0000000..6268e34 --- /dev/null +++ b/l2rpn_baselines/ppo_stablebaselines/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + + +from l2rpn_baselines.utils import GymAgent + +from stable_baselines3 import PPO + +class SB3Agent(GymAgent): + def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path, nn_type=PPO): + self._nn_type = nn_type + super().__init__(g2op_action_space, gym_act_space, gym_obs_space, nn_path) + + def get_act(self, gym_obs, reward, done): + action, _ = self.nn_model.predict(gym_obs, deterministic=True) + return action + + def load(self): + """ + Load the NN models + """ + self.nn_model = self._nn_type.load(self._nn_path) diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py index 6d02183..f3ea131 100644 --- a/l2rpn_baselines/utils/__init__.py +++ b/l2rpn_baselines/utils/__init__.py @@ -19,7 +19,8 @@ "NNParam", "ReplayBuffer", "BaseDeepQ", - "DeepQAgent" + "DeepQAgent", + "GymAgent" ] from l2rpn_baselines.utils.cli_eval import cli_eval @@ -35,3 +36,4 @@ from l2rpn_baselines.utils.ReplayBuffer import ReplayBuffer from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ from l2rpn_baselines.utils.DeepQAgent import DeepQAgent +from l2rpn_baselines.utils.gymAgent import GymAgent diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index c5b4ba2..803f545 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, RTE (https://www.rte-france.com) +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) # See AUTHORS.txt # This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. # If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, @@ -6,6 +6,8 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+from abc import abstractmethod + from grid2op.Agent import BaseAgent @@ -16,11 +18,34 @@ class GymAgent(BaseAgent): It can then be used as a "regular" grid2op agent, in a runner, grid2viz, grid2game etc. It is also compatible with the "l2rpn baselines" interface. + + Use it only with a trained agent. It does not provide the "save" method and + is not suitable for training. """ def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path): super().__init__(g2op_action_space) self._gym_act_space = gym_act_space self._gym_obs_space = gym_obs_space self._nn_path = nn_path + self.nn_model = None + self.load() + + @abstractmethod + def get_act(self, gym_obs, reward, done): + """ + retrieve the action from the NN model + """ + pass + + @abstractmethod + def load(self): + """ + Load the NN models + """ + pass - \ No newline at end of file + def act(self, observation, reward, done): + gym_obs = self._gym_obs_space.to_gym(observation) + gym_act = self.get_act(gym_obs, reward, done) + grid2op_act = self._gym_act_space.from_gym(gym_act) + return grid2op_act diff --git a/test_Eva.py b/test_Eva.py deleted file mode 100644 index 4508cae..0000000 --- a/test_Eva.py +++ /dev/null @@ -1,65 +0,0 @@ -import json -import os -import grid2op -import re -from grid2op.Reward import L2RPNReward, EpisodeDurationReward -from l2rpn_baselines.utils import TrainingParam, NNParam -from l2rpn_baselines.DuelQSimple import train -from lightsim2grid import LightSimBackend -from grid2op.Chronics import MultifolderWithCache - -# define the environment -env = grid2op.make("l2rpn_case14_sandbox", - reward_class=EpisodeDurationReward, - backend=LightSimBackend(), - chronics_class=MultifolderWithCache) - -env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) -env.chronics_handler.real_data.reset() - -# use the default training parameters -tp = TrainingParam() - -# this will be the list of what part of the observation I want to keep -# more information on https://grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes -li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", - "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", - "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] - -# neural network architecture -observation_size = NNParam.get_obs_size(env, li_attr_obs_X) -sizes = [800, 494, 494] # sizes of each hidden layers -kwargs_archi = {'observation_size': observation_size, - 'sizes': sizes, - 'activs': ["relu" for _ in sizes], # all relu activation function - "list_attr_obs": li_attr_obs_X} -li_act_path = "line_act.json" -if os.path.exists(li_act_path): - with open(li_act_path, "r", encoding="utf-8") as f: - all_acts = json.load(f) -else: - all_acts = [env.action_space().as_serializable_dict()] - for el in range(env.n_line): - all_acts.append(env.action_space({"set_line_status" : [(el, -1)]}).as_serializable_dict()) - all_acts.append(env.action_space({"set_line_status" : [(el, +1)]}).as_serializable_dict()) - - with open(li_act_path, "w", encoding="utf-8") as f: - json.dump(fp=f, obj=all_acts) - -# select some part of the action -# more information at https://grid2op.readthedocs.io/en/latest/converter.html#grid2op.Converter.IdToAct.init_converter -kwargs_converters = {"all_actions": all_acts } -# define the name of the model -nm_ = "AnneOnymous6" -try: - train(env, - name=nm_, - iterations=1_000_000, - save_path="./saved_agents", - load_path=None, - 
logs_dir="./logs", - training_param=tp, - kwargs_converters=kwargs_converters, - kwargs_archi=kwargs_archi) -finally: - env.close() diff --git a/test_eva_dn.py b/test_eva_dn.py deleted file mode 100644 index 6c224d5..0000000 --- a/test_eva_dn.py +++ /dev/null @@ -1,23 +0,0 @@ -import pdb -import grid2op -import re -import os -from grid2op.Reward import L2RPNReward -from grid2op.Runner import Runner -from lightsim2grid import LightSimBackend -from grid2op.Chronics import MultifolderWithCache - -# define the environment -env = grid2op.make("l2rpn_case14_sandbox", - reward_class=L2RPNReward, - backend=LightSimBackend(), - # chronics_class=MultifolderWithCache - ) - -# env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) -# env.chronics_handler.real_data.reset() -runner = Runner(**env.get_params_for_runner()) -res = runner.run(nb_episode=10, episode_id=["0000", "0100", "0200", "0300", "0400", "0500", "0600", "0700", "0800", "0900"]) -pdb.set_trace() -{'0000': 1091, '0100': 1097, '0300': 1096, '0400': 2828, '0500': 514, '0600': 1091, '0700': 717, '0800': 513, '0900': 381} -# mean time survived: 1036.4444444444443 \ No newline at end of file diff --git a/test_stable_baselines.py b/test_stable_baselines.py deleted file mode 100644 index b11d28c..0000000 --- a/test_stable_baselines.py +++ /dev/null @@ -1,54 +0,0 @@ -import gym -import json -import os -import grid2op -import re -from grid2op.Reward import L2RPNReward, EpisodeDurationReward -from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace -from l2rpn_baselines.DuelQSimple import train -from lightsim2grid import LightSimBackend -from grid2op.Chronics import MultifolderWithCache -from stable_baselines3 import PPO - -# define the environment -env = grid2op.make("l2rpn_case14_sandbox", - reward_class=EpisodeDurationReward, - backend=LightSimBackend(), - chronics_class=MultifolderWithCache) - -env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) -env.chronics_handler.real_data.reset() -env_gym = GymEnv(env) - -li_act_path = "line_act.json" -if os.path.exists(li_act_path): - with open(li_act_path, "r", encoding="utf-8") as f: - all_acts = json.load(f) -else: - all_acts = [env.action_space().as_serializable_dict()] - for el in range(env.n_line): - all_acts.append(env.action_space({"set_line_status" : [(el, -1)]}).as_serializable_dict()) - all_acts.append(env.action_space({"set_line_status" : [(el, +1)]}).as_serializable_dict()) - -env_gym.action_space = DiscreteActSpace(env.action_space, action_list= all_acts) - -li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", - "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", - "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] -env_gym.observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=li_attr_obs_X) - -# learn -model = PPO("MlpPolicy", env_gym, verbose=1, tensorboard_log="./logs") -model.learn(total_timesteps=100_000) - - -# test -obs = env_gym.reset() -for i in range(1000): - action, _states = model.predict(obs, deterministic=True) - obs, reward, done, info = env_gym.step(action) - if done: - print(f"{reward=}") - obs = env_gym.reset() - -env.close() \ No newline at end of file From d6a913afc68c088ca8081bdced275175e5725bed Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 24 Jan 2022 18:46:19 +0100 Subject: [PATCH 05/56] adding a bit of doc, need still tests and packaging --- .gitignore | 5 +- 
.../ppo_stablebaselines/__init__.py | 8 + .../ppo_stablebaselines/evaluate.py | 261 +++++++++++++++--- l2rpn_baselines/ppo_stablebaselines/train.py | 5 +- 4 files changed, 238 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index d096ca0..d3d3583 100644 --- a/.gitignore +++ b/.gitignore @@ -168,4 +168,7 @@ l2rpn_baselines/LeapNetEncoded/model_test/ l2rpn_baselines/SACOld/trained_models/ l2rpn_baselines/ppo_stablebaselines/logs l2rpn_baselines/ppo_stablebaselines/saved_baseline -l2rpn_baselines/ppo_stablebaselines/saved_model \ No newline at end of file +l2rpn_baselines/ppo_stablebaselines/saved_model +test_Eva.py +test_box_act.py +test_eva_dn.py diff --git a/l2rpn_baselines/ppo_stablebaselines/__init__.py b/l2rpn_baselines/ppo_stablebaselines/__init__.py index e69de29..98c2da4 100644 --- a/l2rpn_baselines/ppo_stablebaselines/__init__.py +++ b/l2rpn_baselines/ppo_stablebaselines/__init__.py @@ -0,0 +1,8 @@ +__all__ = [ + "evaluate", + "train" +] + +# from l2rpn_baselines.ppo_stablebaselines.DuelQSimple import DuelQSimple +from l2rpn_baselines.ppo_stablebaselines.evaluate import evaluate +from l2rpn_baselines.ppo_stablebaselines.train import train diff --git a/l2rpn_baselines/ppo_stablebaselines/evaluate.py b/l2rpn_baselines/ppo_stablebaselines/evaluate.py index 62a313f..f414f7d 100644 --- a/l2rpn_baselines/ppo_stablebaselines/evaluate.py +++ b/l2rpn_baselines/ppo_stablebaselines/evaluate.py @@ -27,7 +27,116 @@ def evaluate(env, verbose=False, save_gif=False, **kwargs): + """ + This function will use stable baselines 3 to train a PPO agent on + a grid2op environment "env". + + It will use the grid2op "gym_compat" module to convert the action space + to a BoxActionSpace and the observation to a BoxObservationSpace. + + It is suited for the studying the impact of continuous actions: + + - on storage units + - on dispatchable generators + - on generators with renewable energy sources + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + name: ``str``` + The name of your agent. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. + + logs_dir: ``str`` + Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. + nb_episode: ``str`` + How many episodes to run during the assessment of the performances + + nb_process: ``int`` + On how many process the assessment will be made. (setting this > 1 can lead to some speed ups but can be + unstable on some plaform) + + max_steps: ``int`` + How many steps at maximum your agent will be assessed + + verbose: ``bool`` + Currently un used + + save_gif: ``bool`` + Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might + take a lot of ram) and drastically increase computation time. + + kwargs: + extra parameters passed to the PPO from stable baselines 3 + + Returns + ------- + + baseline: + The loaded baseline as a stable baselines PPO element. + + Examples + --------- + + Here is an example on how to train a ppo_stablebaseline . + + First define a python script, for example + + .. 
code-block:: python + + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended ! + from l2rpn_baselines.ppo_stablebaselines import evaluate + + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + evaluate(env, + nb_episode=nb_episode, + load_path="./saved_model", # should be the same as what has been called in the train function ! + name="test", # should be the same as what has been called in the train function ! + nb_process=1, + verbose=verbose, + ) + + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() + + """ + # load the attributes kept my_path = os.path.join(load_path, name) if not os.path.exists(load_path): @@ -84,45 +193,119 @@ def evaluate(env, if __name__ == "__main__": - import grid2op - from grid2op.Action import CompleteAction - from grid2op.Reward import L2RPNReward, EpisodeDurationReward, LinesCapacityReward - from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace - from lightsim2grid import LightSimBackend - from grid2op.Chronics import MultifolderWithCache - import pdb - - nb_episode = 7 - nb_process = 1 - verbose = True - - env = grid2op.make("educ_case14_storage", - test=True, - action_class=CompleteAction, - reward_class=LinesCapacityReward, - backend=LightSimBackend()) - - evaluate(env, - nb_episode=nb_episode, - load_path="./saved_model", - name="test4", - nb_process=1, - verbose=verbose, - ) - # to compare with do nothing - runner_params = env.get_params_for_runner() - runner = Runner(**runner_params) + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended ! 
+ # from l2rpn_baselines.ppo_stablebaselines import evaluate - res = runner.run(nb_episode=nb_episode, - nb_process=nb_process - ) + nb_episode = 7 + nb_process = 1 + verbose = True - # Print summary - if verbose: - print("Evaluation summary for DN:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) \ No newline at end of file + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + evaluate(env, + nb_episode=nb_episode, + load_path="./saved_model", + name="test", + nb_process=1, + verbose=verbose, + ) + + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() + + # import re + # from grid2op.Reward import LinesCapacityReward # or any other rewards + # from lightsim2grid import LightSimBackend # highly recommended ! + # from grid2op.Chronics import MultifolderWithCache # highly recommended + + # env_name = "l2rpn_case14_sandbox" + # env = grid2op.make(env_name, + # reward_class=LinesCapacityReward, + # backend=LightSimBackend(), + # chronics_class=MultifolderWithCache) + + # env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) + # env.chronics_handler.real_data.reset() + # # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline + # # for more information ! 
+ + # try: + # train(env, + # iterations=10_000, # any number of iterations you want + # logs_dir="./logs", # where the tensorboard logs will be put + # save_path="./saved_model", # where the NN weights will be saved + # name="test", # name of the baseline + # net_arch=[100, 100, 100], # architecture of the NN + # save_every_xxx_steps=2000, # save the NN every 2k steps + # ) + # finally: + # env.close() + + # import grid2op + # from grid2op.Action import CompleteAction + # from grid2op.Reward import L2RPNReward, EpisodeDurationReward, LinesCapacityReward + # from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace + # from lightsim2grid import LightSimBackend + # from grid2op.Chronics import MultifolderWithCache + # import pdb + + # nb_episode = 7 + # nb_process = 1 + # verbose = True + + # env = grid2op.make("educ_case14_storage", + # test=True, + # action_class=CompleteAction, + # reward_class=LinesCapacityReward, + # backend=LightSimBackend()) + + # evaluate(env, + # nb_episode=nb_episode, + # load_path="./saved_model", + # name="test4", + # nb_process=1, + # verbose=verbose, + # ) + + # # to compare with do nothing + # runner_params = env.get_params_for_runner() + # runner = Runner(**runner_params) + + # res = runner.run(nb_episode=nb_episode, + # nb_process=nb_process + # ) + + # # Print summary + # if verbose: + # print("Evaluation summary for DN:") + # for _, chron_name, cum_reward, nb_time_step, max_ts in res: + # msg_tmp = "chronics at: {}".format(chron_name) + # msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + # msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + # print(msg_tmp) diff --git a/l2rpn_baselines/ppo_stablebaselines/train.py b/l2rpn_baselines/ppo_stablebaselines/train.py index 0fc8c54..edb93a8 100644 --- a/l2rpn_baselines/ppo_stablebaselines/train.py +++ b/l2rpn_baselines/ppo_stablebaselines/train.py @@ -127,9 +127,11 @@ def train(env, .. code-block:: python import re + import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards - from lightsim2grid import LightSimBackend # highly recommended ! from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended ! + from l2rpn_baselines.ppo_stablebaselines import train env_name = "l2rpn_case14_sandbox" env = grid2op.make(env_name, @@ -235,6 +237,7 @@ def train(env, if __name__ == "__main__": import re + import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards from lightsim2grid import LightSimBackend # highly recommended ! 
from grid2op.Chronics import MultifolderWithCache # highly recommended From d7945d768faa62a2d48c84abe43fdbe71da71e19 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 15:57:38 +0100 Subject: [PATCH 06/56] fix the imports for the online documentation --- docs/DeepQSimple.rst | 2 + docs/DoubleDuelingDQN.rst | 2 + docs/DoubleDuelingRDQN.rst | 2 + docs/DuelQLeapNet.rst | 2 + docs/DuelQSimple.rst | 2 + docs/ExpertAgent.rst | 2 + docs/LeapNetEncoded.rst | 2 + docs/SACOld.rst | 2 + docs/conf.py | 2 +- docs/index.rst | 1 + docs/ppo_stable_baselines.rst | 115 ++++++++++++++++++ docs/template.rst | 2 + l2rpn_baselines/DeepQSimple/DeepQ_NN.py | 17 ++- l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py | 3 +- l2rpn_baselines/DeepQSimple/evaluate.py | 4 +- l2rpn_baselines/DeepQSimple/train.py | 3 +- .../DoubleDuelingDQN/DoubleDuelingDQN.py | 10 +- .../DoubleDuelingDQNConfig.py | 1 - .../DoubleDuelingDQN/DoubleDuelingDQN_NN.py | 17 ++- l2rpn_baselines/DoubleDuelingDQN/evaluate.py | 3 +- .../DoubleDuelingDQN/inspect_action_space.py | 3 +- l2rpn_baselines/DoubleDuelingDQN/train.py | 4 +- .../DoubleDuelingRDQN/DoubleDuelingRDQN.py | 12 +- .../DoubleDuelingRDQNConfig.py | 9 +- .../DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py | 22 ++-- .../DoubleDuelingRDQN/ExperienceBuffer.py | 1 - l2rpn_baselines/DoubleDuelingRDQN/evaluate.py | 3 +- l2rpn_baselines/DoubleDuelingRDQN/train.py | 4 +- .../DuelQLeapNet/DuelQLeapNet_NN.py | 59 ++++----- .../DuelQLeapNet/LeapNet_NNParam.py | 3 +- l2rpn_baselines/DuelQLeapNet/evaluate.py | 7 +- l2rpn_baselines/DuelQLeapNet/train.py | 3 +- l2rpn_baselines/DuelQSimple/DuelQ_NN.py | 19 ++- l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py | 2 +- l2rpn_baselines/DuelQSimple/evaluate.py | 5 +- l2rpn_baselines/DuelQSimple/train.py | 2 +- l2rpn_baselines/ExpertAgent/ExpertAgent.py | 23 ++-- l2rpn_baselines/ExpertAgent/evaluate.py | 9 +- .../LeapNetEncoded/LeapNetEncoded_NN.py | 31 ++--- l2rpn_baselines/LeapNetEncoded/evaluate.py | 6 +- l2rpn_baselines/LeapNetEncoded/study.py | 4 +- l2rpn_baselines/LeapNetEncoded/train.py | 4 +- l2rpn_baselines/SACOld/SACOld_NN.py | 20 +-- l2rpn_baselines/SACOld/evaluate.py | 4 +- l2rpn_baselines/SACOld/train.py | 2 +- l2rpn_baselines/SliceRDQN/ExperienceBuffer.py | 1 - l2rpn_baselines/SliceRDQN/SliceRDQN.py | 11 +- l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py | 8 ++ l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py | 23 ++-- l2rpn_baselines/SliceRDQN/evaluate.py | 6 +- l2rpn_baselines/SliceRDQN/slice_util.py | 8 ++ l2rpn_baselines/SliceRDQN/train.py | 4 +- .../ppo_stablebaselines/__init__.py | 5 +- l2rpn_baselines/ppo_stablebaselines/train.py | 59 ++++----- l2rpn_baselines/ppo_stablebaselines/utils.py | 60 ++++++++- l2rpn_baselines/utils/BaseDeepQ.py | 14 ++- l2rpn_baselines/utils/DeepQAgent.py | 9 +- l2rpn_baselines/utils/gymAgent.py | 87 ++++++++++++- l2rpn_baselines/utils/save_log_gif.py | 12 +- setup.py | 3 +- 60 files changed, 555 insertions(+), 210 deletions(-) create mode 100644 docs/ppo_stable_baselines.rst diff --git a/docs/DeepQSimple.rst b/docs/DeepQSimple.rst index b572462..d3031a7 100644 --- a/docs/DeepQSimple.rst +++ b/docs/DeepQSimple.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.DeepQSimple + DeepQSimple: A simple implementation of the Deep Q Learning =========================================================== diff --git a/docs/DoubleDuelingDQN.rst b/docs/DoubleDuelingDQN.rst index df13a60..2420a82 100644 --- a/docs/DoubleDuelingDQN.rst +++ b/docs/DoubleDuelingDQN.rst @@ -1,3 +1,5 @@ +.. 
currentmodule:: l2rpn_baselines.DoubleDuelingDQN + DoubleDuelingDQN: A example implementation of Double Duelling Deep Q Network ============================================================================ diff --git a/docs/DoubleDuelingRDQN.rst b/docs/DoubleDuelingRDQN.rst index c286e1f..b1e280f 100644 --- a/docs/DoubleDuelingRDQN.rst +++ b/docs/DoubleDuelingRDQN.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.DoubleDuelingRDQN + DoubleDuelingRDQN: A example implementation of Recurrent DoubleQ Network ======================================================================== diff --git a/docs/DuelQLeapNet.rst b/docs/DuelQLeapNet.rst index 37dfdbd..59f614d 100644 --- a/docs/DuelQLeapNet.rst +++ b/docs/DuelQLeapNet.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.DuelQLeapNet + DuelQLeapNet: D3QN with LeapNet ================================ diff --git a/docs/DuelQSimple.rst b/docs/DuelQSimple.rst index 947a727..436456f 100644 --- a/docs/DuelQSimple.rst +++ b/docs/DuelQSimple.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.DuelQSimple + DuelQSimple: Double Duelling Deep Q Learning ============================================= diff --git a/docs/ExpertAgent.rst b/docs/ExpertAgent.rst index 0bb8c6c..c9e8ebd 100644 --- a/docs/ExpertAgent.rst +++ b/docs/ExpertAgent.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.ExpertAgent + ExpertAgent: A example implementation of using ExpertOpForGrid for empirical overflow solving ============================================================================================= diff --git a/docs/LeapNetEncoded.rst b/docs/LeapNetEncoded.rst index 92a3674..e256d72 100644 --- a/docs/LeapNetEncoded.rst +++ b/docs/LeapNetEncoded.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.LeapNetEncoded + LeapNetEncoded: D3QN on a state encoded by a leap net ====================================================== diff --git a/docs/SACOld.rst b/docs/SACOld.rst index 3cf6237..5147342 100644 --- a/docs/SACOld.rst +++ b/docs/SACOld.rst @@ -1,3 +1,5 @@ +.. currentmodule:: l2rpn_baselines.SACOld + SAC: Soft Actor Critic ========================= diff --git a/docs/conf.py b/docs/conf.py index eae6bd0..d2f52ef 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -74,6 +74,6 @@ html_static_path = ['_static'] def setup(app): - app.add_javascript('custom.js') + # app.add_javascript('custom.js') if app.config.language == 'ja': app.config.intersphinx_mapping['py'] = ('https://docs.python.org/ja/3', None) diff --git a/docs/index.rst b/docs/index.rst index 10b2a1a..4b97742 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,6 +29,7 @@ Baseline already Available DoubleDuelingDQN DuelQSimple ExpertAgent + ppo_stable_baselines More advanced baselines diff --git a/docs/ppo_stable_baselines.rst b/docs/ppo_stable_baselines.rst new file mode 100644 index 0000000..f3f216e --- /dev/null +++ b/docs/ppo_stable_baselines.rst @@ -0,0 +1,115 @@ +.. currentmodule:: l2rpn_baselines.ppo_stablebaselines + +PPO Stable Baselines +=========================================================== + +Description +----------- +This "baseline" aims at providing a code example on how to use an agent +from the Sable Baselines repository (see https://stable-baselines3.readthedocs.io/en/master/) +with grid2op. 
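In a nutshell, the glue needed for that is a gym wrapper around the grid2op environment. A minimal sketch, assuming the standard `grid2op.gym_compat` wrappers and stable-baselines3 are installed (the environment name and the attributes kept in the observation and action spaces are purely illustrative):

.. code-block:: python

    import grid2op
    from grid2op.gym_compat import GymEnv, BoxGymObsSpace, BoxGymActSpace
    from stable_baselines3 import PPO

    env = grid2op.make("l2rpn_case14_sandbox")

    # wrap the grid2op environment into a gym one
    env_gym = GymEnv(env)
    env_gym.observation_space.close()
    env_gym.observation_space = BoxGymObsSpace(env.observation_space,
                                               attr_to_keep=["rho", "prod_p", "load_p"])
    env_gym.action_space.close()
    env_gym.action_space = BoxGymActSpace(env.action_space,
                                          attr_to_keep=["redispatch"])

    # from there any stable-baselines3 algorithm can be used as usual
    model = PPO("MlpPolicy", env_gym, verbose=1)
    model.learn(total_timesteps=1_000)

The `train` and `evaluate` functions of this baseline wrap this kind of code and add saving and reloading of the resulting agent.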
+ +It also serve a second goal, to show how to train a PPO agent to perform +continuous actions on the powergrid (*eg* adjusting the generator value, either +by applying `redispatching` kind of action for controlable generators or +by with `curtailment` on generator using new renewable energy sources - solar and wind +or even to control the state of the storage units.) + + +Exported class +-------------- +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.ppo_stablebaselines import train, evaluate, PPOSB_Agent + + +Create an agent from scratch +++++++++++++++++++++++++++++++ + +For example, to create an agent from scratch, with some parameters: + +.. code-block:: python + + import grid2op + from grid2op.gym_compat import GymEnv, BoxGymActSpace + + # create the grid2op environment + env = grid2op.make(...) + ############# + + # convert it to a suitable gym environment + env_gym = GymEnv(env) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space) + ############# + + # create the PPO Stable Baselines agent (only some basic configs are given here) + agent = PPOSB_Agent(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs={ + "policy": MlpPolicy, # or any other stable baselines 3 policy + "env": env_gym, + "verbose": 1, # or anything else + "learning_rate": 3e-4, # you can change that + "policy_kwargs": { + "net_arch": [100, 100, 100] # and that + } + }, + nn_path=None + ) + +.. note:: + The agent above is NOT trained. So it will basically output "random" actions. + + You should probably train it before hand (see the `train` function) + +Load a trained agent ++++++++++++++++++++++++ +You can also load a trained agent, to use it with a grid2op environment, in a runner, +in grid2game or any other frameworks related to grid2op. + + +.. code-block:: python + + import grid2op + from grid2op.gym_compat import GymEnv, BoxGymActSpace + + # create the grid2op environment + env = grid2op.make(...) + ############# + + # convert it to a suitable gym environment + env_gym = GymEnv(env) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space) + ############# + + # create the PPO Stable Baselines agent (only some basic configs are given here) + agent = PPOSB_Agent(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_path=... # path where you saved it ! + ) + + +Detailed documentation +++++++++++++++++++++++++ + +.. automodule:: l2rpn_baselines.ppo_stablebaselines + :members: + :autosummary: + +Other non exported class +------------------------ +These classes need to be imported, if you want to import them with (non exhaustive list): +.. code-block:: python + + from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN + + +.. autoclass:: l2rpn_baselines.DeepQSimple.DeepQ_NN.DeepQ_NN + :members: + :autosummary: diff --git a/docs/template.rst b/docs/template.rst index e274088..2e8842f 100644 --- a/docs/template.rst +++ b/docs/template.rst @@ -1,3 +1,5 @@ +.. 
currentmodule:: l2rpn_baselines.Template + Template: How to contribute to l2rpn baselines ============================================== diff --git a/l2rpn_baselines/DeepQSimple/DeepQ_NN.py b/l2rpn_baselines/DeepQSimple/DeepQ_NN.py index 53a5249..f4e069a 100644 --- a/l2rpn_baselines/DeepQSimple/DeepQ_NN.py +++ b/l2rpn_baselines/DeepQSimple/DeepQ_NN.py @@ -9,11 +9,15 @@ # tf2.0 friendly import warnings -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Activation, Dense - from tensorflow.keras.layers import Input +try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation, Dense + from tensorflow.keras.layers import Input + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False from l2rpn_baselines.utils import BaseDeepQ, TrainingParam @@ -31,6 +35,9 @@ class DeepQ_NN(BaseDeepQ): def __init__(self, nn_params, training_param=None): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + if training_param is None: training_param = TrainingParam() BaseDeepQ.__init__(self, diff --git a/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py b/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py index 973d61e..8a7c0b6 100644 --- a/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py +++ b/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py @@ -5,7 +5,6 @@ # you can obtain one at http://mozilla.org/MPL/2.0/. # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -import os import copy from l2rpn_baselines.utils import NNParam @@ -15,7 +14,7 @@ class DeepQ_NNParam(NNParam): """ This defined the specific parameters for the DeepQ network. Nothing really different compared to the base class - except that :attr:`l2rpn_baselines.NNParam.nn_class` is :class:`DeepQ_NN` + except that :attr:`l2rpn_baselines.utils.NNParam.nn_class` is :class:`DeepQ_NN` """ _int_attr = copy.deepcopy(NNParam._int_attr) diff --git a/l2rpn_baselines/DeepQSimple/evaluate.py b/l2rpn_baselines/DeepQSimple/evaluate.py index a8a90fd..0f098ba 100644 --- a/l2rpn_baselines/DeepQSimple/evaluate.py +++ b/l2rpn_baselines/DeepQSimple/evaluate.py @@ -9,12 +9,9 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import os -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.DeepQSimple.DeepQSimple import DeepQSimple, DEFAULT_NAME @@ -110,6 +107,7 @@ def evaluate(env, """ + import tensorflow as tf # lazy import to save import time # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/DeepQSimple/train.py b/l2rpn_baselines/DeepQSimple/train.py index 5e2a322..d76e3fd 100755 --- a/l2rpn_baselines/DeepQSimple/train.py +++ b/l2rpn_baselines/DeepQSimple/train.py @@ -9,7 +9,6 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
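The hunks above introduce the recipe that the rest of this patch applies to every tensorflow-based baseline: probe the optional dependency once at module level, raise only when the model is actually instantiated, and defer the heavy import in `train` / `evaluate` to call time. Condensed into one hypothetical module (the names `MyTFModel` and `my_evaluate` are placeholders, not code from the repository):

.. code-block:: python

    # probe the optional dependency once, at module level
    try:
        import tensorflow as tf
        _CAN_USE_TENSORFLOW = True
    except ImportError:
        _CAN_USE_TENSORFLOW = False


    class MyTFModel:
        """hypothetical tensorflow-backed model"""
        def __init__(self):
            # fail only when someone actually tries to use the model
            if not _CAN_USE_TENSORFLOW:
                raise RuntimeError("Cannot import tensorflow, this class cannot be used.")


    def my_evaluate():
        # lazy import: the cost of importing tensorflow is only paid when
        # evaluation really runs, keeping `import l2rpn_baselines` fast
        import tensorflow as tf
        return tf.config.list_physical_devices('GPU')

With this in place the package can be imported (and its documentation built) on a machine without tensorflow; the error only surfaces when a tensorflow-based baseline is created or trained.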
import os -import tensorflow as tf import warnings from l2rpn_baselines.utils import cli_train @@ -140,7 +139,7 @@ def train(env, env.close() """ - + import tensorflow as tf # lazy import to save import time # Limit gpu usage try: physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py index a8ce8b8..ea88fa2 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py @@ -10,7 +10,12 @@ import json import math import numpy as np -import tensorflow as tf +try: + import tensorflow as tf + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct @@ -25,6 +30,9 @@ def __init__(self, action_space, name=__name__, is_training=False): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py index 3f67c26..94d38af 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py @@ -6,7 +6,6 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -import os import json diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py index e0e67d4..ef3acd3 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py @@ -7,11 +7,15 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import numpy as np -import tensorflow as tf -import tensorflow.keras as tfk -import tensorflow.keras.optimizers as tfko -import tensorflow.keras.layers as tfkl -import tensorflow.keras.activations as tfka +try: + import tensorflow as tf + import tensorflow.keras as tfk + import tensorflow.keras.optimizers as tfko + import tensorflow.keras.layers as tfkl + import tensorflow.keras.activations as tfka + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False class DoubleDuelingDQN_NN(object): @@ -23,6 +27,9 @@ def __init__(self, learning_rate = 1e-5, learning_rate_decay_steps = 1000, learning_rate_decay_rate = 0.95): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + self.action_size = action_size self.observation_size = observation_size self.lr = learning_rate diff --git a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py index de33865..c0bbfc7 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py @@ -10,7 +10,6 @@ import os import argparse -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner @@ -67,6 +66,8 @@ def evaluate(env, verbose=DEFAULT_VERBOSE, save_gif=False): + import tensorflow as tf # lazy import to save import time + # Set config D3QNConfig.N_FRAMES = num_frames D3QNConfig.VERBOSE = verbose diff --git a/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py b/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py index 278a2ed..94763c7 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py +++ b/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py @@ -11,7 +11,7 @@ import argparse import json -import tensorflow as tf + import numpy as np from grid2op.MakeEnv import make2 @@ -101,6 +101,7 @@ def print_actions(agent): if __name__ == "__main__": + import tensorflow as tf args = cli() env = make2(args.path_data, action_class=PowerlineChangeAndDispatchAction) # Limit gpu usage diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index bf60b6a..c3f2bd8 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -9,7 +9,6 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import argparse -import tensorflow as tf from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig @@ -70,7 +69,8 @@ def train(env, batch_size= DEFAULT_BATCH_SIZE, learning_rate= DEFAULT_LR, verbose=DEFAULT_VERBOSE): - + import tensorflow as tf # lazy import to save import time + # Set config D3QNConfig.LR = learning_rate D3QNConfig.N_FRAMES = num_frames diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py index 722d40d..cda4405 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py @@ -10,9 +10,13 @@ import json import copy import numpy as np -import tensorflow as tf +try: + import tensorflow as tf + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + -from grid2op.Parameters import Parameters from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct @@ -26,6 +30,10 @@ def __init__(self, action_space, name=__name__, is_training=False): + + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py index 321fc29..45b384d 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py @@ -1,4 +1,11 @@ -import os +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+ import json class DoubleDuelingRDQNConfig(): diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py index 884a7ec..f0ca7d1 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py @@ -8,13 +8,17 @@ import numpy as np import random -import tensorflow as tf -import tensorflow.keras as tfk -import tensorflow.keras.backend as K -import tensorflow.keras.models as tfkm -import tensorflow.keras.optimizers as tfko -import tensorflow.keras.layers as tfkl -import tensorflow.keras.activations as tfka +try: + import tensorflow as tf + import tensorflow.keras as tfk + import tensorflow.keras.backend as K + import tensorflow.keras.models as tfkm + import tensorflow.keras.optimizers as tfko + import tensorflow.keras.layers as tfkl + import tensorflow.keras.activations as tfka + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False class DoubleDuelingRDQN_NN(object): @@ -22,6 +26,10 @@ def __init__(self, action_size, observation_size, learning_rate = 1e-5): + + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + self.action_size = action_size self.observation_size = observation_size self.h_size = 512 diff --git a/l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py b/l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py index a595382..4ca463d 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py @@ -8,7 +8,6 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -from collections import deque import random import numpy as np diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index d003dfb..e4b0166 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -10,7 +10,6 @@ import os import argparse -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner @@ -60,7 +59,7 @@ def evaluate(env, max_steps=DEFAULT_MAX_STEPS, verbose=DEFAULT_VERBOSE, save_gif=False): - + import tensorflow as tf # lazy import to save import time # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index d75f216..160823f 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -9,7 +9,6 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import argparse -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Reward import * @@ -75,7 +74,8 @@ def train(env, batch_size=DEFAULT_BATCH_SIZE, learning_rate=DEFAULT_LR, verbose=DEFAULT_VERBOSE): - + import tensorflow as tf # lazy import to save import time + # Set config RDQNConfig.TRACE_LENGTH = trace_length RDQNConfig.BATCH_SIZE = batch_size diff --git a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py index caf77d8..2b35cc8 100644 --- a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py +++ b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py @@ -10,38 +10,33 @@ # tf2.0 friendly import warnings - -import tensorflow as tf -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Activation - from tensorflow.keras.layers import Input, Lambda, subtract, add - import tensorflow.keras.backend as K +try: + import tensorflow as tf + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation + from tensorflow.keras.layers import Input, Lambda, subtract, add + import tensorflow.keras.backend as K + + # TODO implement that in the leap net package too + from tensorflow.keras.layers import Layer + from tensorflow.keras.layers import Dense + from tensorflow.keras.layers import add as tfk_add + from tensorflow.keras.layers import multiply as tfk_multiply + + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + + class Layer(object): + """Empty class to be used in the documentation. This should + be `from tensorflow.keras.layers import Layer` + """ + pass from l2rpn_baselines.utils import BaseDeepQ, TrainingParam -# try: -# from leap_net import Ltau # this import might change if you use the "quick and dirty way". -# except ImportError: -# # Copyright (c) 2019-2020, RTE (https://www.rte-france.com) -# # See AUTHORS.txt -# # This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. -# # If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, -# # you can obtain one at http://mozilla.org/MPL/2.0/. -# # SPDX-License-Identifier: MPL-2.0 -# # This file is part of leap_net, leap_net a keras implementation of the LEAP Net model. -# MSG_WARNING = "Leap net model is not installed on your system. Please visit \n" \ -# "https://github.com/BDonnot/leap_net \n" \ -# "to have the latest Leap net implementation." 
-# warnings.warn(MSG_WARNING) - -# TODO implement that in the leap net package too -from tensorflow.keras.layers import Layer -from tensorflow.keras.layers import Dense -from tensorflow.keras.layers import add as tfk_add -from tensorflow.keras.layers import multiply as tfk_multiply - class LtauBis(Layer): """ @@ -52,6 +47,9 @@ class LtauBis(Layer): """ def __init__(self, initializer='glorot_uniform', use_bias=True, trainable=True, name=None, **kwargs): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + super(LtauBis, self).__init__(trainable=trainable, name=name, **kwargs) self.initializer = initializer self.use_bias = use_bias @@ -101,6 +99,9 @@ class DuelQLeapNet_NN(BaseDeepQ): def __init__(self, nn_params, training_param=None): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + if training_param is None: training_param = TrainingParam() BaseDeepQ.__init__(self, diff --git a/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py b/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py index b794b80..10c1140 100644 --- a/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py +++ b/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py @@ -5,7 +5,6 @@ # you can obtain one at http://mozilla.org/MPL/2.0/. # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -import os import numpy as np import copy @@ -80,4 +79,4 @@ def __init__(self, self.tau_mults = tau_mults def get_obs_attr(self): - return self.list_attr_obs + self.list_attr_obs_tau \ No newline at end of file + return self.list_attr_obs + self.list_attr_obs_tau diff --git a/l2rpn_baselines/DuelQLeapNet/evaluate.py b/l2rpn_baselines/DuelQLeapNet/evaluate.py index d2d7c22..e8d6b00 100644 --- a/l2rpn_baselines/DuelQLeapNet/evaluate.py +++ b/l2rpn_baselines/DuelQLeapNet/evaluate.py @@ -9,21 +9,15 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
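Compared to the plain guard used elsewhere, `DuelQLeapNet_NN.py` above needs one extra trick: `LtauBis` inherits from keras' `Layer`, so a placeholder `Layer` class is defined in the `except` branch, keeping the module importable (notably by the online documentation) when tensorflow is missing. A stripped-down sketch of that idea, in which only `Layer` comes from the patch and the other names are illustrative:

.. code-block:: python

    try:
        from tensorflow.keras.layers import Layer
        _CAN_USE_TENSORFLOW = True
    except ImportError:
        _CAN_USE_TENSORFLOW = False

        class Layer(object):
            """placeholder so that the subclass below can still be defined
            (and documented) without tensorflow"""
            pass


    class MyCustomLayer(Layer):
        """hypothetical custom keras layer"""
        def __init__(self, *args, **kwargs):
            if not _CAN_USE_TENSORFLOW:
                raise RuntimeError("Cannot import tensorflow, this class cannot be used.")
            super().__init__(*args, **kwargs)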
import os -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * -from grid2op.Episode import EpisodeData from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet import DuelQLeapNet, DEFAULT_NAME from l2rpn_baselines.DuelQLeapNet.LeapNet_NNParam import LeapNet_NNParam from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN -import pdb - DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" DEFAULT_NB_EPISODE = 1 DEFAULT_NB_PROCESS = 1 @@ -113,6 +107,7 @@ def evaluate(env, """ + import tensorflow as tf # lazy import to save time # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/DuelQLeapNet/train.py b/l2rpn_baselines/DuelQLeapNet/train.py index f3f5c0e..5fea200 100755 --- a/l2rpn_baselines/DuelQLeapNet/train.py +++ b/l2rpn_baselines/DuelQLeapNet/train.py @@ -10,7 +10,6 @@ import os import warnings -import tensorflow as tf from l2rpn_baselines.utils import cli_train from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet import DuelQLeapNet, DEFAULT_NAME @@ -155,7 +154,7 @@ def train(env, env.close() """ - + import tensorflow as tf # lazy import to save time # Limit gpu usage try: physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/DuelQSimple/DuelQ_NN.py b/l2rpn_baselines/DuelQSimple/DuelQ_NN.py index 73a2f06..f264cd4 100644 --- a/l2rpn_baselines/DuelQSimple/DuelQ_NN.py +++ b/l2rpn_baselines/DuelQSimple/DuelQ_NN.py @@ -9,12 +9,16 @@ # tf2.0 friendly import warnings -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Activation, Dense - from tensorflow.keras.layers import Input, Lambda, subtract, add - import tensorflow.keras.backend as K +try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation, Dense + from tensorflow.keras.layers import Input, Lambda, subtract, add + import tensorflow.keras.backend as K + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False from l2rpn_baselines.utils import BaseDeepQ, TrainingParam @@ -24,6 +28,9 @@ class DuelQ_NN(BaseDeepQ): def __init__(self, nn_params, training_param=None): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + if training_param is None: training_param = TrainingParam() BaseDeepQ.__init__(self, diff --git a/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py b/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py index aa9d2d4..87fdba0 100644 --- a/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py +++ b/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py @@ -35,4 +35,4 @@ def __init__(self, sizes, activs, list_attr_obs - ) \ No newline at end of file + ) diff --git a/l2rpn_baselines/DuelQSimple/evaluate.py b/l2rpn_baselines/DuelQSimple/evaluate.py index 52acc9c..ba75c4f 100644 --- a/l2rpn_baselines/DuelQSimple/evaluate.py +++ b/l2rpn_baselines/DuelQSimple/evaluate.py @@ -9,19 +9,15 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import os -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.DuelQSimple.DuelQSimple import DuelQSimple, DEFAULT_NAME from l2rpn_baselines.DuelQSimple.DuelQ_NNParam import DuelQ_NNParam from l2rpn_baselines.DuelQSimple.DuelQ_NN import DuelQ_NN -import pdb DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" DEFAULT_NB_EPISODE = 1 @@ -116,6 +112,7 @@ def evaluate(env, """ + import tensorflow as tf # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/DuelQSimple/train.py b/l2rpn_baselines/DuelQSimple/train.py index 369dea3..8c990b2 100755 --- a/l2rpn_baselines/DuelQSimple/train.py +++ b/l2rpn_baselines/DuelQSimple/train.py @@ -10,7 +10,6 @@ import os import warnings -import tensorflow as tf from l2rpn_baselines.utils import cli_train from l2rpn_baselines.DuelQSimple.DuelQSimple import DuelQSimple, DEFAULT_NAME @@ -142,6 +141,7 @@ def train(env, """ + import tensorflow as tf # lazy import to save package import time # Limit gpu usage try: physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/ExpertAgent/ExpertAgent.py b/l2rpn_baselines/ExpertAgent/ExpertAgent.py index c81cc06..4b0d1bd 100644 --- a/l2rpn_baselines/ExpertAgent/ExpertAgent.py +++ b/l2rpn_baselines/ExpertAgent/ExpertAgent.py @@ -5,17 +5,18 @@ # you can obtain one at http://mozilla.org/MPL/2.0/. # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +from grid2op.Agent import BaseAgent +from grid2op.Reward import BaseReward, L2RPNReward +import numpy as np +import pandas as pd +import logging + try: - from grid2op.Agent import BaseAgent from alphaDeesp.expert_operator import expert_operator - from alphaDeesp.core.grid2op.Grid2opSimulation import Grid2opSimulation, score_changes_between_two_observations - from grid2op.Reward import BaseReward, L2RPNReward - import numpy as np - import pandas as pd - import logging + from alphaDeesp.core.grid2op.Grid2opSimulation import Grid2opSimulation + _CAN_USE_EXPERT_AGENT = True except ImportError as exc_: - raise ImportError("ExpertAgent baseline impossible to load the required dependencies for using the model. " - "The error was: \n {}".format(exc_)) + _CAN_USE_EXPERT_AGENT = False class ExpertAgent(BaseAgent): @@ -44,8 +45,12 @@ class ExpertAgent(BaseAgent): def __init__(self, action_space, observation_space, - name, gridName="IEEE118", + name, + gridName="IEEE118", **kwargs): + if not _CAN_USE_EXPERT_AGENT: + raise ImportError("ExpertAgent baseline impossible to load the required dependencies for using the model. 
" + ) super().__init__(action_space) self.name = name self.grid = gridName # IEEE14,IEEE118_R2 (WCCI or Neurips Track Robustness), IEEE118 diff --git a/l2rpn_baselines/ExpertAgent/evaluate.py b/l2rpn_baselines/ExpertAgent/evaluate.py index 8813f72..3c1ebad 100644 --- a/l2rpn_baselines/ExpertAgent/evaluate.py +++ b/l2rpn_baselines/ExpertAgent/evaluate.py @@ -15,16 +15,14 @@ from grid2op.dtypes import dt_int from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * try: from l2rpn_baselines.ExpertAgent import ExpertAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.ExpertAgent.ExpertAgent import other_rewards + _CAN_USE_EXPERT_BASELINE = True except ImportError as exc_: - raise ImportError("ExpertAgent baseline impossible to load the required dependencies for using the model. " - "The error was: \n {}".format(exc_)) + _CAN_USE_EXPERT_BASELINE = False DEFAULT_LOGS_DIR = "./logs-eval/expert-agent-baseline" @@ -112,6 +110,9 @@ def evaluate(env, ------- ``None`` """ + if not _CAN_USE_EXPERT_BASELINE: + raise ImportError("ExpertAgent baseline impossible to load the required dependencies for using the model. " + ) runner_params = env.get_params_for_runner() runner_params["verbose"] = verbose diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py index d3596c0..3f8e347 100644 --- a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py @@ -12,21 +12,21 @@ # tf2.0 friendly import warnings -import tensorflow as tf -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - from tensorflow.keras.models import Sequential, Model - from tensorflow.keras.layers import Activation - from tensorflow.keras.layers import Input, Lambda, subtract, add - import tensorflow.keras.backend as K - +try: + import tensorflow as tf + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation + from tensorflow.keras.layers import Input, Lambda, subtract, add + import tensorflow.keras.backend as K + # TODO implement that in the leap net package too + from tensorflow.keras.layers import Dense + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + from l2rpn_baselines.utils import BaseDeepQ, TrainingParam - - -# TODO implement that in the leap net package too -from tensorflow.keras.layers import Dense - - from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import LtauBis @@ -49,6 +49,9 @@ class LeapNetEncoded_NN(BaseDeepQ): def __init__(self, nn_params, training_param=None): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + if training_param is None: training_param = TrainingParam() BaseDeepQ.__init__(self, diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py index fd45852..5f856db 100644 --- a/l2rpn_baselines/LeapNetEncoded/evaluate.py +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -9,12 +9,9 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import os -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME @@ -111,7 +108,8 @@ def evaluate(env, """ - + import tensorflow as tf # lazy import to save import time + # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/LeapNetEncoded/study.py b/l2rpn_baselines/LeapNetEncoded/study.py index d53ebaf..57c2318 100644 --- a/l2rpn_baselines/LeapNetEncoded/study.py +++ b/l2rpn_baselines/LeapNetEncoded/study.py @@ -9,13 +9,10 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import os -import tensorflow as tf import numpy as np from tqdm import tqdm from grid2op.MakeEnv import make -from grid2op.Reward import * -from grid2op.Action import * from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam @@ -40,6 +37,7 @@ def study(env, save_gif=False): """study the prediction of the grid_model""" + import tensorflow as tf # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py index 1f4623e..bf0ceac 100755 --- a/l2rpn_baselines/LeapNetEncoded/train.py +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -10,7 +10,6 @@ import os import warnings -import tensorflow as tf from l2rpn_baselines.utils import cli_train from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME @@ -151,7 +150,8 @@ def train(env, env.close() """ - + import tensorflow as tf # lazy import to save import time + # Limit gpu usage try: physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/SACOld/SACOld_NN.py b/l2rpn_baselines/SACOld/SACOld_NN.py index 762d761..7d77a41 100644 --- a/l2rpn_baselines/SACOld/SACOld_NN.py +++ b/l2rpn_baselines/SACOld/SACOld_NN.py @@ -8,16 +8,19 @@ import numpy as np import os -import tensorflow as tf # tf2.0 friendly import warnings - -with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - from tensorflow.keras.models import load_model, Sequential, Model - from tensorflow.keras.layers import Activation, Dense - from tensorflow.keras.layers import Input, Concatenate +try: + import tensorflow as tf + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation, Dense + from tensorflow.keras.layers import Input, Concatenate + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False from l2rpn_baselines.utils import BaseDeepQ, TrainingParam @@ -50,6 +53,9 @@ def __init__(self, nn_params, training_param=None, verbose=False): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + if training_param is None: training_param = TrainingParam() BaseDeepQ.__init__(self, diff --git a/l2rpn_baselines/SACOld/evaluate.py b/l2rpn_baselines/SACOld/evaluate.py index c4a710d..4aaabc0 100644 --- a/l2rpn_baselines/SACOld/evaluate.py +++ b/l2rpn_baselines/SACOld/evaluate.py @@ -9,12 +9,9 
@@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import os -import tensorflow as tf from grid2op.MakeEnv import make from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * from l2rpn_baselines.utils.save_log_gif import save_log_gif from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME @@ -109,6 +106,7 @@ def evaluate(env, save_gif=False) """ + import tensorflow as tf # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/SACOld/train.py b/l2rpn_baselines/SACOld/train.py index 914a623..a6712de 100755 --- a/l2rpn_baselines/SACOld/train.py +++ b/l2rpn_baselines/SACOld/train.py @@ -9,7 +9,6 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import os -import tensorflow as tf import warnings from l2rpn_baselines.utils import cli_train @@ -148,6 +147,7 @@ def train(env, """ + import tensorflow as tf # Limit gpu usage try: physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/SliceRDQN/ExperienceBuffer.py b/l2rpn_baselines/SliceRDQN/ExperienceBuffer.py index d4303fe..0d9d6fb 100644 --- a/l2rpn_baselines/SliceRDQN/ExperienceBuffer.py +++ b/l2rpn_baselines/SliceRDQN/ExperienceBuffer.py @@ -8,7 +8,6 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -from collections import deque import random import numpy as np diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN.py index 478ba8f..5aaeb7f 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN.py @@ -10,9 +10,13 @@ import json import copy import numpy as np -import tensorflow as tf -from grid2op.Parameters import Parameters +try: + import tensorflow as tf + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct @@ -28,6 +32,9 @@ def __init__(self, action_space, name=__name__, is_training=False): + if not _CAN_USE_TENSORFLOW: + raise ImportError("Cannot import tensorflow, this function cannot be used.") + # Call parent constructor AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct) diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py index bd61ce5..774bd44 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py @@ -1,3 +1,11 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + import os import json diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py index 3c9d832..2da6ed8 100644 --- a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py +++ b/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py @@ -7,14 +7,18 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import numpy as np -import random -import tensorflow as tf -import tensorflow.keras as tfk -import tensorflow.keras.backend as K -import tensorflow.keras.models as tfkm -import tensorflow.keras.optimizers as tfko -import tensorflow.keras.layers as tfkl -import tensorflow.keras.activations as tfka + +try: + import tensorflow as tf + import tensorflow.keras as tfk + import tensorflow.keras.backend as K + import tensorflow.keras.models as tfkm + import tensorflow.keras.optimizers as tfko + import tensorflow.keras.layers as tfkl + import tensorflow.keras.activations as tfka + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False class SliceRDQN_NN(object): @@ -23,6 +27,9 @@ def __init__(self, observation_shape, slices, learning_rate = 1e-5): + if not _CAN_USE_TENSORFLOW: + raise ImportError("Cannot import tensorflow, this function cannot be used.") + self.action_size = action_size self.observation_shape = observation_shape self.slices = slices diff --git a/l2rpn_baselines/SliceRDQN/evaluate.py b/l2rpn_baselines/SliceRDQN/evaluate.py index b977e78..daa925e 100755 --- a/l2rpn_baselines/SliceRDQN/evaluate.py +++ b/l2rpn_baselines/SliceRDQN/evaluate.py @@ -10,13 +10,8 @@ import os import argparse -import tensorflow as tf from grid2op.MakeEnv import make -from grid2op.Runner import Runner -from grid2op.Reward import * -from grid2op.Action import * - from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif @@ -60,6 +55,7 @@ def evaluate(env, verbose=DEFAULT_VERBOSE, save_gif=False): + import tensorflow as tf # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices): diff --git a/l2rpn_baselines/SliceRDQN/slice_util.py b/l2rpn_baselines/SliceRDQN/slice_util.py index 2386d62..9bc81c4 100644 --- a/l2rpn_baselines/SliceRDQN/slice_util.py +++ b/l2rpn_baselines/SliceRDQN/slice_util.py @@ -1,3 +1,11 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + import numpy as np def lines_q_len(action_space): diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index a5a8c0d..5539426 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -9,11 +9,8 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import argparse -import tensorflow as tf from grid2op.MakeEnv import make -from grid2op.Reward import * -from grid2op.Action import * from grid2op.Parameters import Parameters from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent @@ -78,6 +75,7 @@ def train(env, learning_rate=DEFAULT_LR, verbose=DEFAULT_VERBOSE): + import tensorflow as tf # Set config RDQNConfig.LR = learning_rate RDQNConfig.BATCH_SIZE = batch_size diff --git a/l2rpn_baselines/ppo_stablebaselines/__init__.py b/l2rpn_baselines/ppo_stablebaselines/__init__.py index 98c2da4..dbf9799 100644 --- a/l2rpn_baselines/ppo_stablebaselines/__init__.py +++ b/l2rpn_baselines/ppo_stablebaselines/__init__.py @@ -1,8 +1,9 @@ __all__ = [ "evaluate", - "train" + "train", + "PPOSB_Agent" ] -# from l2rpn_baselines.ppo_stablebaselines.DuelQSimple import DuelQSimple +from l2rpn_baselines.ppo_stablebaselines.utils import SB3Agent as PPOSB_Agent from l2rpn_baselines.ppo_stablebaselines.evaluate import evaluate from l2rpn_baselines.ppo_stablebaselines.train import train diff --git a/l2rpn_baselines/ppo_stablebaselines/train.py b/l2rpn_baselines/ppo_stablebaselines/train.py index edb93a8..9c7bc2d 100644 --- a/l2rpn_baselines/ppo_stablebaselines/train.py +++ b/l2rpn_baselines/ppo_stablebaselines/train.py @@ -11,14 +11,24 @@ import copy import os import grid2op -from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv - import json -from stable_baselines3.common.callbacks import CheckpointCallback -from stable_baselines3 import PPO -from stable_baselines3.ppo import MlpPolicy +from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv +try: + from stable_baselines3.common.callbacks import CheckpointCallback + from stable_baselines3 import PPO + from stable_baselines3.ppo import MlpPolicy + _CAN_USE_STABLE_BASELINE = True +except ImportError: + _CAN_USE_STABLE_BASELINE = False + class MlpPolicy(object): + """ + Do not use, this class is a template when stable baselines3 is not installed. + + It represents `from stable_baselines3.ppo import MlpPolicy` + """ + _default_obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", @@ -130,7 +140,7 @@ def train(env, import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards from grid2op.Chronics import MultifolderWithCache # highly recommended - from lightsim2grid import LightSimBackend # highly recommended ! + from lightsim2grid import LightSimBackend # highly recommended for training ! from l2rpn_baselines.ppo_stablebaselines import train env_name = "l2rpn_case14_sandbox" @@ -157,6 +167,8 @@ def train(env, env.close() """ + if not _CAN_USE_STABLE_BASELINE: + raise ImportError("Cannot use this function as stable baselines3 is not installed") if act_attr_to_keep == _default_act_attr_to_keep: # by default, i remove all the attributes that are not supported by the action type # i do not do that if the user specified specific attributes to keep. This is his responsibility in @@ -240,7 +252,7 @@ def train(env, import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards from lightsim2grid import LightSimBackend # highly recommended ! 
- from grid2op.Chronics import MultifolderWithCache # highly recommended + from grid2op.Chronics import MultifolderWithCache # highly recommended for training env_name = "l2rpn_case14_sandbox" env = grid2op.make(env_name, @@ -248,41 +260,16 @@ def train(env, backend=LightSimBackend(), chronics_class=MultifolderWithCache) - env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*0$", x) is not None) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! train(env, - iterations=10_000, + iterations=10_000_000, logs_dir="./logs", save_path="./saved_model", - name="test", - net_arch=[100, 100, 100], + name="test2", + net_arch=[200, 200, 200], save_every_xxx_steps=2000, ) - - - # from grid2op.Action import CompleteAction - # from grid2op.Reward import LinesCapacityReward - # from lightsim2grid import LightSimBackend - # from grid2op.Chronics import MultifolderWithCache - - # env = grid2op.make("educ_case14_storage", - # test=True, - # action_class=CompleteAction, - # reward_class=LinesCapacityReward, - # backend=LightSimBackend(), - # chronics_class=MultifolderWithCache) - - # env.chronics_handler.real_data.set_filter(lambda x: True) - # env.chronics_handler.real_data.reset() - - # train(env, - # iterations=10_000, - # logs_dir="./logs", - # save_path="./saved_model", - # name="test4", - # net_arch=[100, 100, 100], - # save_every_xxx_steps=2000, - # ) diff --git a/l2rpn_baselines/ppo_stablebaselines/utils.py b/l2rpn_baselines/ppo_stablebaselines/utils.py index 6268e34..250c090 100644 --- a/l2rpn_baselines/ppo_stablebaselines/utils.py +++ b/l2rpn_baselines/ppo_stablebaselines/utils.py @@ -9,19 +9,71 @@ from l2rpn_baselines.utils import GymAgent -from stable_baselines3 import PPO +try: + from stable_baselines3 import PPO +except ImportError: + _CAN_USE_STABLE_BASELINE = False + class PPO(object): + """ + Do not use, this class is a template when stable baselines3 is not installed. + + It represents `from stable_baselines3 import PPO` + """ + class SB3Agent(GymAgent): - def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path, nn_type=PPO): + def __init__(self, + g2op_action_space, + gym_act_space, + gym_obs_space, + nn_type=PPO, + nn_path=None, + nn_kwargs=None): self._nn_type = nn_type - super().__init__(g2op_action_space, gym_act_space, gym_obs_space, nn_path) + super().__init__(g2op_action_space, gym_act_space, gym_obs_space, + nn_path=nn_path, nn_kwargs=nn_kwargs) def get_act(self, gym_obs, reward, done): + """Retrieve the gym action from the gym observation and the reward. + It only (for now) work for non recurrent policy. + + Parameters + ---------- + gym_obs : gym observation + The gym observation + reward : ``float`` + the current reward + done : ``bool`` + whether the episode is over or not. + + Returns + ------- + gym action + The gym action, that is processed in the :func:`GymAgent.act` + to be used with grid2op + """ action, _ = self.nn_model.predict(gym_obs, deterministic=True) return action def load(self): """ - Load the NN models + Load the NN model. + + In the case of a PPO agent, this is equivalent to perform the: + + .. code-block:: python + + PPO.load(nn_path) """ self.nn_model = self._nn_type.load(self._nn_path) + + def build(self): + """Create the underlying NN model from scratch. + + In the case of a PPO agent, this is equivalent to perform the: + + .. 
code-block:: python + + PPO(**nn_kwargs) + """ + self.nn_model = PPO(**self._nn_kwargs) diff --git a/l2rpn_baselines/utils/BaseDeepQ.py b/l2rpn_baselines/utils/BaseDeepQ.py index 5a1f1cd..5db864f 100644 --- a/l2rpn_baselines/utils/BaseDeepQ.py +++ b/l2rpn_baselines/utils/BaseDeepQ.py @@ -10,8 +10,15 @@ from abc import ABC, abstractmethod import numpy as np import warnings -import tensorflow as tf -import tensorflow.keras.optimizers as tfko + +try: + import tensorflow as tf + import tensorflow.keras.optimizers as tfko + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False + + from l2rpn_baselines.utils.TrainingParam import TrainingParam @@ -71,6 +78,9 @@ def __init__(self, nn_params, training_param=None, verbose=False): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + self._action_size = nn_params.action_size self._observation_size = nn_params.observation_size self._nn_archi = nn_params diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index 5a78346..2a76764 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -10,7 +10,6 @@ import warnings import numpy as np from tqdm import tqdm -import tensorflow as tf import grid2op from grid2op.Exceptions import Grid2OpException @@ -26,6 +25,11 @@ except ImportError: _CACHE_AVAILABLE_DEEPQAGENT = False +try: + import tensorflow as tf + _CAN_USE_TENSORFLOW = True +except ImportError: + _CAN_USE_TENSORFLOW = False class DeepQAgent(AgentWithConverter): """ @@ -127,6 +131,9 @@ def __init__(self, verbose=False, observation_space=None, **kwargs_converters): + if not _CAN_USE_TENSORFLOW: + raise RuntimeError("Cannot import tensorflow, this function cannot be used.") + AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct, **kwargs_converters) self.filter_action_fun = filter_action_fun if self.filter_action_fun is not None: diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 803f545..3ff8cd8 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -7,8 +7,11 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. from abc import abstractmethod +import copy from grid2op.Agent import BaseAgent +from grid2op.Observation import BaseObservation +from grid2op.Action import BaseAction class GymAgent(BaseAgent): @@ -21,15 +24,51 @@ class GymAgent(BaseAgent): Use it only with a trained agent. It does not provide the "save" method and is not suitable for training. + + ..info:: + To load a previously saved agent the function `GymAgent.load` will be called + and you must provide the `nn_path` keyword argument. + + To build a new agent, the function `GymAgent.build` is called and + you must provide the `nn_kwargs` keyword argument. + + You cannot set both, you have to set one. 
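For instance, with the `SB3Agent` defined earlier in this patch, the two mutually exclusive construction modes look as follows (the path and keyword values are illustrative, and `env` / `env_gym` are assumed to be a grid2op environment and its gym wrapper built as in the other examples of this patch):

.. code-block:: python

    from stable_baselines3.ppo import MlpPolicy
    from l2rpn_baselines.ppo_stablebaselines.utils import SB3Agent

    # reload a trained agent: only `nn_path` is given, so `load()` is called
    agent = SB3Agent(env.action_space,
                     env_gym.action_space,
                     env_gym.observation_space,
                     nn_path="./saved_model/test")

    # build a fresh, untrained agent: only `nn_kwargs` is given, so `build()` is called
    agent = SB3Agent(env.action_space,
                     env_gym.action_space,
                     env_gym.observation_space,
                     nn_kwargs={"policy": MlpPolicy,
                                "env": env_gym,
                                "learning_rate": 3e-4})

    # passing both `nn_path` and `nn_kwargs`, or neither, raises a RuntimeError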
""" - def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path): + def __init__(self, + g2op_action_space, + gym_act_space, + gym_obs_space, + *, # to prevent positional argument + nn_path=None, + nn_kwargs=None): super().__init__(g2op_action_space) self._gym_act_space = gym_act_space self._gym_obs_space = gym_obs_space - self._nn_path = nn_path + if nn_path is None and nn_kwargs is None: + raise RuntimeError("Impossible to build a GymAgent without providing at " + "least one of `nn_path` (to load the agent from disk) " + "or `nn_kwargs` (to create the underlying agent).") + if nn_path is not None and nn_kwargs is not None: + raise RuntimeError("Impossible to build a GymAgent by providing both " + "`nn_path` (*ie* you want load the agent from disk) " + "and `nn_kwargs` (*ie* you want to create the underlying agent from these " + "parameters).") + if nn_path is not None: + self._nn_path = nn_path + else: + self._nn_path = None + + if nn_kwargs is not None: + self._nn_kwargs = copy.deepcopy(nn_kwargs) + else: + self._nn_kwargs = None + self.nn_model = None - self.load() - + if nn_path is not None: + self.load() + else: + self.build() + @abstractmethod def get_act(self, gym_obs, reward, done): """ @@ -40,11 +79,47 @@ def get_act(self, gym_obs, reward, done): @abstractmethod def load(self): """ - Load the NN models + Load the NN model + + ..info:: Only called if the agent has been build with `nn_path` not None and `nn_kwargs=None` """ pass + + @abstractmethod + def build(self): + """ + Build the NN model. + + ..info:: Only called if the agent has been build with `nn_path=None` and `nn_kwargs` not None + """ + pass + + def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAction: + """This function is called to "map" the grid2op world + into a usable format by a neural networks (for example in a format + usable by stable baselines or ray/rllib) - def act(self, observation, reward, done): + Parameters + ---------- + observation : BaseObservation + The grid2op observation + reward : ``float`` + The reward + done : function + the flag "done" by open ai gym. + + Returns + ------- + BaseAction + The action taken by the agent, in a form of a grid2op BaseAction. + + Notes + ------- + In case your "real agent" wants to implement some "non learned" heuristic, + you can also put them here. + + In this case the "gym agent" will only be used in particular settings. + """ gym_obs = self._gym_obs_space.to_gym(observation) gym_act = self.get_act(gym_obs, reward, done) grid2op_act = self._gym_act_space.from_gym(gym_act) diff --git a/l2rpn_baselines/utils/save_log_gif.py b/l2rpn_baselines/utils/save_log_gif.py index 647b8c9..1f9a38a 100644 --- a/l2rpn_baselines/utils/save_log_gif.py +++ b/l2rpn_baselines/utils/save_log_gif.py @@ -7,8 +7,12 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import os -from grid2op.Episode import EpisodeReplay - +try: + from grid2op.Episode import EpisodeReplay + _CAN_USE = True +except ImportError: + # cannot use the save_log_gif function + _CAN_USE = False def save_log_gif(path_log, res, gif_name=None): """ @@ -27,6 +31,10 @@ def save_log_gif(path_log, res, gif_name=None): Name of the gif that will be used. 
""" + if not _CAN_USE: + raise RuntimeError("Cannot use the \"save_log_gif\" function as the " + "\"from grid2op.Episode import EpisodeReplay\" cannot be imported") + init_gif_name = gif_name ep_replay = EpisodeReplay(path_log) for _, chron_name, cum_reward, nb_time_step, max_ts in res: diff --git a/setup.py b/setup.py index 8df191d..1d0d153 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,8 @@ "grid2op", "statsmodels>=0.11.1", "scipy>=1.4.1", - "numpy" + "numpy", + "gym>=0.17.2" ], "extras": { "docs": [ From 0a7622d6e072f14e586f3608e08b73660ac3c2ea Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 17:39:31 +0100 Subject: [PATCH 07/56] should be ok --- .gitignore | 4 ++ CHANGELOG.rst | 11 ++++ docs/{DeepQSimple.rst => deepqsimple.rst} | 3 +- ...bleDuelingDQN.rst => doubleduelingdqn.rst} | 0 ...eDuelingRDQN.rst => doubleduelingrdqn.rst} | 0 docs/{DuelQLeapNet.rst => duelqleapnet.rst} | 0 docs/{DuelQSimple.rst => duelqsimple.rst} | 0 docs/{ExpertAgent.rst => expertagent.rst} | 0 docs/external_contributions.rst | 32 +++++++++++ docs/index.rst | 23 +++++--- ...{LeapNetEncoded.rst => leapnetencoded.rst} | 0 docs/ppo_stable_baselines.rst | 56 ++++++++----------- docs/{SACOld.rst => sacold.rst} | 0 l2rpn_baselines/DeepQSimple/__init__.py | 4 +- .../{DeepQSimple.py => deepQSimple.py} | 0 .../DeepQSimple/{DeepQ_NN.py => deepQ_NN.py} | 2 - .../{DeepQ_NNParam.py => deepQ_NNParam.py} | 11 ++-- l2rpn_baselines/DeepQSimple/evaluate.py | 9 +-- l2rpn_baselines/DeepQSimple/train.py | 6 +- l2rpn_baselines/DoNothing/__init__.py | 2 +- .../DoNothing/{DoNothing.py => doNothing.py} | 0 l2rpn_baselines/DoubleDuelingDQN/__init__.py | 4 +- ...oubleDuelingDQN.py => doubleDuelingDQN.py} | 4 +- ...DQNConfig.py => doubleDuelingDQNConfig.py} | 0 ...uelingDQN_NN.py => doubleDuelingDQN_NN.py} | 0 l2rpn_baselines/DoubleDuelingDQN/evaluate.py | 4 +- .../DoubleDuelingDQN/inspect_action_space.py | 2 +- l2rpn_baselines/DoubleDuelingDQN/train.py | 4 +- l2rpn_baselines/DoubleDuelingRDQN/__init__.py | 4 +- ...bleDuelingRDQN.py => doubleDuelingRDQN.py} | 6 +- ...QNConfig.py => doubleDuelingRDQNConfig.py} | 0 ...lingRDQN_NN.py => doubleDuelingRDQN_NN.py} | 0 l2rpn_baselines/DoubleDuelingRDQN/evaluate.py | 4 +- ...xperienceBuffer.py => experienceBuffer.py} | 0 l2rpn_baselines/DoubleDuelingRDQN/train.py | 4 +- l2rpn_baselines/DuelQLeapNet/__init__.py | 4 +- .../{DuelQLeapNet.py => duelQLeapNet.py} | 4 +- ...{DuelQLeapNet_NN.py => duelQLeapNet_NN.py} | 0 l2rpn_baselines/DuelQLeapNet/evaluate.py | 12 ++-- ...{LeapNet_NNParam.py => leapNet_NNParam.py} | 2 +- l2rpn_baselines/DuelQLeapNet/train.py | 10 ++-- l2rpn_baselines/DuelQSimple/__init__.py | 4 +- .../{DuelQSimple.py => duelQSimple.py} | 0 .../DuelQSimple/{DuelQ_NN.py => duelQ_NN.py} | 0 .../{DuelQ_NNParam.py => duelQ_NNParam.py} | 2 +- l2rpn_baselines/DuelQSimple/evaluate.py | 6 +- l2rpn_baselines/DuelQSimple/train.py | 6 +- l2rpn_baselines/ExpertAgent/__init__.py | 4 +- l2rpn_baselines/ExpertAgent/evaluate.py | 2 +- .../{ExpertAgent.py => expertAgent.py} | 0 l2rpn_baselines/LeapNetEncoded/__init__.py | 4 +- l2rpn_baselines/LeapNetEncoded/evaluate.py | 6 +- .../{LeapNetEncoded.py => leapNetEncoded.py} | 6 +- ...pNetEncoded_NN.py => leapNetEncoded_NN.py} | 4 +- ...d_NNParam.py => leapNetEncoded_NNParam.py} | 4 +- l2rpn_baselines/LeapNetEncoded/study.py | 22 ++++---- l2rpn_baselines/LeapNetEncoded/train.py | 17 +++--- l2rpn_baselines/PPO_SB3/__init__.py | 17 ++++++ .../evaluate.py | 2 +- .../{ppo_stablebaselines => PPO_SB3}/train.py | 0 .../{ppo_stablebaselines => 
PPO_SB3}/utils.py | 0 l2rpn_baselines/SliceRDQN/__init__.py | 4 +- l2rpn_baselines/SliceRDQN/evaluate.py | 2 +- ...xperienceBuffer.py => experienceBuffer.py} | 0 .../SliceRDQN/{SliceRDQN.py => sliceRDQN.py} | 6 +- ...liceRDQN_Config.py => sliceRDQN_Config.py} | 0 .../{SliceRDQN_NN.py => sliceRDQN_NN.py} | 0 l2rpn_baselines/SliceRDQN/train.py | 4 +- l2rpn_baselines/Template/__init__.py | 4 +- l2rpn_baselines/Template/evaluate.py | 2 +- .../Template/{Template.py => template.py} | 0 l2rpn_baselines/Template/train.py | 2 +- .../ppo_stablebaselines/__init__.py | 9 --- l2rpn_baselines/test/test_import.py | 6 +- l2rpn_baselines/utils/__init__.py | 23 +++++++- .../utils/{BaseDeepQ.py => baseDeepQ.py} | 2 +- .../utils/{DeepQAgent.py => deepQAgent.py} | 5 +- l2rpn_baselines/utils/gymAgent.py | 25 +++++++++ .../utils/{NNParam.py => nnParam.py} | 2 +- .../{ReplayBuffer.py => replayBuffer.py} | 0 .../utils/{RLAgent.py => rlAgent.py} | 3 +- .../{TrainingParam.py => trainingParam.py} | 0 82 files changed, 270 insertions(+), 166 deletions(-) rename docs/{DeepQSimple.rst => deepqsimple.rst} (94%) rename docs/{DoubleDuelingDQN.rst => doubleduelingdqn.rst} (100%) rename docs/{DoubleDuelingRDQN.rst => doubleduelingrdqn.rst} (100%) rename docs/{DuelQLeapNet.rst => duelqleapnet.rst} (100%) rename docs/{DuelQSimple.rst => duelqsimple.rst} (100%) rename docs/{ExpertAgent.rst => expertagent.rst} (100%) create mode 100644 docs/external_contributions.rst rename docs/{LeapNetEncoded.rst => leapnetencoded.rst} (100%) rename docs/{SACOld.rst => sacold.rst} (100%) rename l2rpn_baselines/DeepQSimple/{DeepQSimple.py => deepQSimple.py} (100%) rename l2rpn_baselines/DeepQSimple/{DeepQ_NN.py => deepQ_NN.py} (96%) rename l2rpn_baselines/DeepQSimple/{DeepQ_NNParam.py => deepQ_NNParam.py} (85%) rename l2rpn_baselines/DoNothing/{DoNothing.py => doNothing.py} (100%) rename l2rpn_baselines/DoubleDuelingDQN/{DoubleDuelingDQN.py => doubleDuelingDQN.py} (99%) rename l2rpn_baselines/DoubleDuelingDQN/{DoubleDuelingDQNConfig.py => doubleDuelingDQNConfig.py} (100%) rename l2rpn_baselines/DoubleDuelingDQN/{DoubleDuelingDQN_NN.py => doubleDuelingDQN_NN.py} (100%) rename l2rpn_baselines/DoubleDuelingRDQN/{DoubleDuelingRDQN.py => doubleDuelingRDQN.py} (98%) rename l2rpn_baselines/DoubleDuelingRDQN/{DoubleDuelingRDQNConfig.py => doubleDuelingRDQNConfig.py} (100%) rename l2rpn_baselines/DoubleDuelingRDQN/{DoubleDuelingRDQN_NN.py => doubleDuelingRDQN_NN.py} (100%) rename l2rpn_baselines/DoubleDuelingRDQN/{ExperienceBuffer.py => experienceBuffer.py} (100%) rename l2rpn_baselines/DuelQLeapNet/{DuelQLeapNet.py => duelQLeapNet.py} (79%) rename l2rpn_baselines/DuelQLeapNet/{DuelQLeapNet_NN.py => duelQLeapNet_NN.py} (100%) rename l2rpn_baselines/DuelQLeapNet/{LeapNet_NNParam.py => leapNet_NNParam.py} (98%) rename l2rpn_baselines/DuelQSimple/{DuelQSimple.py => duelQSimple.py} (100%) rename l2rpn_baselines/DuelQSimple/{DuelQ_NN.py => duelQ_NN.py} (100%) rename l2rpn_baselines/DuelQSimple/{DuelQ_NNParam.py => duelQ_NNParam.py} (95%) rename l2rpn_baselines/ExpertAgent/{ExpertAgent.py => expertAgent.py} (100%) rename l2rpn_baselines/LeapNetEncoded/{LeapNetEncoded.py => leapNetEncoded.py} (78%) rename l2rpn_baselines/LeapNetEncoded/{LeapNetEncoded_NN.py => leapNetEncoded_NN.py} (98%) rename l2rpn_baselines/LeapNetEncoded/{LeapNetEncoded_NNParam.py => leapNetEncoded_NNParam.py} (98%) create mode 100644 l2rpn_baselines/PPO_SB3/__init__.py rename l2rpn_baselines/{ppo_stablebaselines => PPO_SB3}/evaluate.py (99%) rename l2rpn_baselines/{ppo_stablebaselines => 
PPO_SB3}/train.py (100%) rename l2rpn_baselines/{ppo_stablebaselines => PPO_SB3}/utils.py (100%) rename l2rpn_baselines/SliceRDQN/{ExperienceBuffer.py => experienceBuffer.py} (100%) rename l2rpn_baselines/SliceRDQN/{SliceRDQN.py => sliceRDQN.py} (98%) rename l2rpn_baselines/SliceRDQN/{SliceRDQN_Config.py => sliceRDQN_Config.py} (100%) rename l2rpn_baselines/SliceRDQN/{SliceRDQN_NN.py => sliceRDQN_NN.py} (100%) rename l2rpn_baselines/Template/{Template.py => template.py} (100%) delete mode 100644 l2rpn_baselines/ppo_stablebaselines/__init__.py rename l2rpn_baselines/utils/{BaseDeepQ.py => baseDeepQ.py} (99%) rename l2rpn_baselines/utils/{DeepQAgent.py => deepQAgent.py} (99%) rename l2rpn_baselines/utils/{NNParam.py => nnParam.py} (99%) rename l2rpn_baselines/utils/{ReplayBuffer.py => replayBuffer.py} (100%) rename l2rpn_baselines/utils/{RLAgent.py => rlAgent.py} (95%) rename l2rpn_baselines/utils/{TrainingParam.py => trainingParam.py} (100%) diff --git a/.gitignore b/.gitignore index d3d3583..fe9ebcc 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,7 @@ l2rpn_baselines/ppo_stablebaselines/saved_model test_Eva.py test_box_act.py test_eva_dn.py +test_import_pposb.py +test_make_gym_env.py +test_multifolderwithcache.py +l2rpn_baselines/PPO_SB3/saved_model/** \ No newline at end of file diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a5bdd8f..3688105 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,8 +2,19 @@ Change Log =========== [TODO] -------- +- code a baseline example using ray / rllib +- code a baseline example using mazerl - stack multiple states in `utils/DeepQAgent` +[0.6.0] - 2022-xx-yy +-------------------- +- [BREAKING] name of the file inside the submodule are now lowercase (PEP 8 compliance) + Use `from l2rpn_baselines.[BASELINENAME] import [BASELINENAME]` by replacing + `[BASELINENAME]` with ... the baseline name (*eg* `from l2rpn_baselines.DoNothing import DoNothing`) +- [FIXED] clean the documentation +- [FIXED] some bugs (especially in the type of actions) for some agents +- [ADDED] a code example to use stable baselines 3 (see l2rpn_baselines.ppo_stable_baselines) + [0.5.1] - 2021-04-09 --------------------- - [FIXED] issue with grid2op version >= 1.2.3 for some baselines diff --git a/docs/DeepQSimple.rst b/docs/deepqsimple.rst similarity index 94% rename from docs/DeepQSimple.rst rename to docs/deepqsimple.rst index d3031a7..1875222 100644 --- a/docs/DeepQSimple.rst +++ b/docs/deepqsimple.rst @@ -25,11 +25,12 @@ You can use this class with: Other non exported class ------------------------ These classes need to be imported, if you want to import them with (non exhaustive list): + .. code-block:: python from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN -.. autoclass:: l2rpn_baselines.DeepQSimple.DeepQ_NN.DeepQ_NN +.. 
autoclass:: l2rpn_baselines.DeepQSimple.deepQ_NN.DeepQ_NN :members: :autosummary: diff --git a/docs/DoubleDuelingDQN.rst b/docs/doubleduelingdqn.rst similarity index 100% rename from docs/DoubleDuelingDQN.rst rename to docs/doubleduelingdqn.rst diff --git a/docs/DoubleDuelingRDQN.rst b/docs/doubleduelingrdqn.rst similarity index 100% rename from docs/DoubleDuelingRDQN.rst rename to docs/doubleduelingrdqn.rst diff --git a/docs/DuelQLeapNet.rst b/docs/duelqleapnet.rst similarity index 100% rename from docs/DuelQLeapNet.rst rename to docs/duelqleapnet.rst diff --git a/docs/DuelQSimple.rst b/docs/duelqsimple.rst similarity index 100% rename from docs/DuelQSimple.rst rename to docs/duelqsimple.rst diff --git a/docs/ExpertAgent.rst b/docs/expertagent.rst similarity index 100% rename from docs/ExpertAgent.rst rename to docs/expertagent.rst diff --git a/docs/external_contributions.rst b/docs/external_contributions.rst new file mode 100644 index 0000000..dc2aa27 --- /dev/null +++ b/docs/external_contributions.rst @@ -0,0 +1,32 @@ +.. currentmodule:: l2rpn_baselines.DeepQSimple + +External Contributions +=========================================================== + +Description +----------- +In this section you can find some examples made by other persons that are +included into l2rpn-baselines if you download it with github. + +You can find more information in the associated github. + +AsynchronousActorCritic +------------------------- +github: https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git + +Geirina +----------- +guthub: https://github.com/djmax008/GEIRINA_baseline + +Kaist +------- +github: https://github.com/sunghoonhong/L2RPN-WCCI-2020-Winner + +PandapowerOPFAgent +-------------------- +github: https://github.com/jhmenke/grid2op_pp_baseline + + +.. warning:: + We do not maintain any of these repository. If you have trouble to make them work + please contact directly their authors. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 4b97742..143c7a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,14 +21,20 @@ How to contribute Baseline already Available --------------------------- +These are the "baselines" that are available. Please note that each of these baselines +is provided as an example of what can be achieved with grid2op. + +It can serve a possible implementation for a usecase. At the moment, we do not provide +baseline with hyper parameters tuned that performs correctly. + .. toctree:: :maxdepth: 2 utils - DeepQSimple - DoubleDuelingDQN - DuelQSimple - ExpertAgent + deepqsimple + doubleduelingdqn + duelqsimple + expertagent ppo_stable_baselines @@ -38,9 +44,10 @@ More advanced baselines .. toctree:: :maxdepth: 2 - DuelQLeapNet - DoubleDuelingRDQN - LeapNetEncoded + duelqleapnet + doubleduelingrdqn + leapnetencoded + external_contributions Deprecated baselines @@ -49,7 +56,7 @@ Deprecated baselines .. toctree:: :maxdepth: 2 - SACOld + sacold Contributions diff --git a/docs/LeapNetEncoded.rst b/docs/leapnetencoded.rst similarity index 100% rename from docs/LeapNetEncoded.rst rename to docs/leapnetencoded.rst diff --git a/docs/ppo_stable_baselines.rst b/docs/ppo_stable_baselines.rst index f3f216e..1966963 100644 --- a/docs/ppo_stable_baselines.rst +++ b/docs/ppo_stable_baselines.rst @@ -22,7 +22,7 @@ You can use this class with: .. 
code-block:: python - from l2rpn_baselines.ppo_stablebaselines import train, evaluate, PPOSB_Agent + from l2rpn_baselines.PPO_SB3 import train, evaluate, PPO_SB3 Create an agent from scratch @@ -34,6 +34,7 @@ For example, to create an agent from scratch, with some parameters: import grid2op from grid2op.gym_compat import GymEnv, BoxGymActSpace + from l2rpn_baselines.PPO_SB3 import PPO_SB3 # create the grid2op environment env = grid2op.make(...) @@ -46,20 +47,20 @@ For example, to create an agent from scratch, with some parameters: ############# # create the PPO Stable Baselines agent (only some basic configs are given here) - agent = PPOSB_Agent(env.action_space, - env_gym.action_space, - env_gym.observation_space, - nn_kwargs={ - "policy": MlpPolicy, # or any other stable baselines 3 policy - "env": env_gym, - "verbose": 1, # or anything else - "learning_rate": 3e-4, # you can change that - "policy_kwargs": { - "net_arch": [100, 100, 100] # and that - } - }, - nn_path=None - ) + agent = PPO_SB3(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs={ + "policy": MlpPolicy, # or any other stable baselines 3 policy + "env": env_gym, + "verbose": 1, # or anything else + "learning_rate": 3e-4, # you can change that + "policy_kwargs": { + "net_arch": [100, 100, 100] # and that + } + }, + nn_path=None + ) .. note:: The agent above is NOT trained. So it will basically output "random" actions. @@ -76,6 +77,7 @@ in grid2game or any other frameworks related to grid2op. import grid2op from grid2op.gym_compat import GymEnv, BoxGymActSpace + from l2rpn_baselines.PPO_SB3 import PPO_SB3 # create the grid2op environment env = grid2op.make(...) @@ -88,28 +90,16 @@ in grid2game or any other frameworks related to grid2op. ############# # create the PPO Stable Baselines agent (only some basic configs are given here) - agent = PPOSB_Agent(env.action_space, - env_gym.action_space, - env_gym.observation_space, - nn_path=... # path where you saved it ! - ) + agent = PPO_SB3(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_path=... # path where you saved it ! + ) Detailed documentation ++++++++++++++++++++++++ -.. automodule:: l2rpn_baselines.ppo_stablebaselines - :members: - :autosummary: - -Other non exported class ------------------------- -These classes need to be imported, if you want to import them with (non exhaustive list): -.. code-block:: python - - from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN - - -.. autoclass:: l2rpn_baselines.DeepQSimple.DeepQ_NN.DeepQ_NN +.. 
automodule:: l2rpn_baselines.PPO_SB3 :members: :autosummary: diff --git a/docs/SACOld.rst b/docs/sacold.rst similarity index 100% rename from docs/SACOld.rst rename to docs/sacold.rst diff --git a/l2rpn_baselines/DeepQSimple/__init__.py b/l2rpn_baselines/DeepQSimple/__init__.py index eed00f7..686ced0 100644 --- a/l2rpn_baselines/DeepQSimple/__init__.py +++ b/l2rpn_baselines/DeepQSimple/__init__.py @@ -5,7 +5,7 @@ "DeepQ_NNParam" ] -from l2rpn_baselines.DeepQSimple.DeepQSimple import DeepQSimple +from l2rpn_baselines.DeepQSimple.deepQSimple import DeepQSimple from l2rpn_baselines.DeepQSimple.evaluate import evaluate from l2rpn_baselines.DeepQSimple.train import train -from l2rpn_baselines.DeepQSimple.DeepQ_NNParam import DeepQ_NNParam +from l2rpn_baselines.DeepQSimple.deepQ_NNParam import DeepQ_NNParam diff --git a/l2rpn_baselines/DeepQSimple/DeepQSimple.py b/l2rpn_baselines/DeepQSimple/deepQSimple.py similarity index 100% rename from l2rpn_baselines/DeepQSimple/DeepQSimple.py rename to l2rpn_baselines/DeepQSimple/deepQSimple.py diff --git a/l2rpn_baselines/DeepQSimple/DeepQ_NN.py b/l2rpn_baselines/DeepQSimple/deepQ_NN.py similarity index 96% rename from l2rpn_baselines/DeepQSimple/DeepQ_NN.py rename to l2rpn_baselines/DeepQSimple/deepQ_NN.py index f4e069a..2405e38 100644 --- a/l2rpn_baselines/DeepQSimple/DeepQ_NN.py +++ b/l2rpn_baselines/DeepQSimple/deepQ_NN.py @@ -48,8 +48,6 @@ def __init__(self, def construct_q_network(self): """ - The network architecture can be changed with the :attr:`l2rpn_baselines.BaseDeepQ.nn_archi` - This function will make 2 identical models, one will serve as a target model, the other one will be trained regurlarly. """ diff --git a/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py b/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py similarity index 85% rename from l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py rename to l2rpn_baselines/DeepQSimple/deepQ_NNParam.py index 8a7c0b6..f7d136e 100644 --- a/l2rpn_baselines/DeepQSimple/DeepQ_NNParam.py +++ b/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py @@ -8,14 +8,15 @@ import copy from l2rpn_baselines.utils import NNParam -from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN +from l2rpn_baselines.DeepQSimple.deepQ_NN import DeepQ_NN class DeepQ_NNParam(NNParam): """ - This defined the specific parameters for the DeepQ network. Nothing really different compared to the base class - except that :attr:`l2rpn_baselines.utils.NNParam.nn_class` is :class:`DeepQ_NN` - + This defined the specific parameters for the DeepQ network. 
+ + Nothing really different compared to the base class + except that :attr:`l2rpn_baselines.utils.NNParam.nn_class` (nn_class) is :class:`deepQ_NN.DeepQ_NN` """ _int_attr = copy.deepcopy(NNParam._int_attr) _float_attr = copy.deepcopy(NNParam._float_attr) @@ -39,4 +40,4 @@ def __init__(self, sizes, activs, list_attr_obs - ) \ No newline at end of file + ) diff --git a/l2rpn_baselines/DeepQSimple/evaluate.py b/l2rpn_baselines/DeepQSimple/evaluate.py index 0f098ba..6f2506a 100644 --- a/l2rpn_baselines/DeepQSimple/evaluate.py +++ b/l2rpn_baselines/DeepQSimple/evaluate.py @@ -14,9 +14,9 @@ from grid2op.Runner import Runner from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.DeepQSimple.DeepQSimple import DeepQSimple, DEFAULT_NAME -from l2rpn_baselines.DeepQSimple.DeepQ_NNParam import DeepQ_NNParam -from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN +from l2rpn_baselines.DeepQSimple.deepQSimple import DeepQSimple, DEFAULT_NAME +from l2rpn_baselines.DeepQSimple.deepQ_NNParam import DeepQ_NNParam +from l2rpn_baselines.DeepQSimple.deepQ_NN import DeepQ_NN DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" @@ -36,7 +36,8 @@ def evaluate(env, save_gif=False, filter_action_fun=None): """ - How to evaluate the performances of the trained DeepQSimple agent. + How to evaluate the performances of the trained :class:`DeepQSimple` agent. + Parameters ---------- diff --git a/l2rpn_baselines/DeepQSimple/train.py b/l2rpn_baselines/DeepQSimple/train.py index d76e3fd..7d1db21 100755 --- a/l2rpn_baselines/DeepQSimple/train.py +++ b/l2rpn_baselines/DeepQSimple/train.py @@ -12,9 +12,9 @@ import warnings from l2rpn_baselines.utils import cli_train -from l2rpn_baselines.DeepQSimple.DeepQSimple import DeepQSimple, DEFAULT_NAME -from l2rpn_baselines.DeepQSimple.DeepQ_NNParam import DeepQ_NNParam -from l2rpn_baselines.DeepQSimple.DeepQ_NN import DeepQ_NN +from l2rpn_baselines.DeepQSimple.deepQSimple import DeepQSimple, DEFAULT_NAME +from l2rpn_baselines.DeepQSimple.deepQ_NNParam import DeepQ_NNParam +from l2rpn_baselines.DeepQSimple.deepQ_NN import DeepQ_NN from l2rpn_baselines.utils import TrainingParam from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY diff --git a/l2rpn_baselines/DoNothing/__init__.py b/l2rpn_baselines/DoNothing/__init__.py index 2a6ee55..6d059c5 100644 --- a/l2rpn_baselines/DoNothing/__init__.py +++ b/l2rpn_baselines/DoNothing/__init__.py @@ -3,6 +3,6 @@ "evaluate" ] -from l2rpn_baselines.DoNothing.DoNothing import DoNothing +from l2rpn_baselines.DoNothing.doNothing import DoNothing from l2rpn_baselines.DoNothing.eval_donothing import evaluate diff --git a/l2rpn_baselines/DoNothing/DoNothing.py b/l2rpn_baselines/DoNothing/doNothing.py similarity index 100% rename from l2rpn_baselines/DoNothing/DoNothing.py rename to l2rpn_baselines/DoNothing/doNothing.py diff --git a/l2rpn_baselines/DoubleDuelingDQN/__init__.py b/l2rpn_baselines/DoubleDuelingDQN/__init__.py index d309b42..8517ca1 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingDQN/__init__.py @@ -5,7 +5,7 @@ "train" ] -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN import DoubleDuelingDQN +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig import DoubleDuelingDQNConfig from l2rpn_baselines.DoubleDuelingDQN.evaluate import evaluate from 
l2rpn_baselines.DoubleDuelingDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN.py similarity index 99% rename from l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py rename to l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN.py index ea88fa2..0bb58ef 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN.py @@ -19,8 +19,8 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN_NN import DoubleDuelingDQN_NN from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py similarity index 100% rename from l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQNConfig.py rename to l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py diff --git a/l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py similarity index 100% rename from l2rpn_baselines/DoubleDuelingDQN/DoubleDuelingDQN_NN.py rename to l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py diff --git a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py index c0bbfc7..5f1269c 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingDQN/evaluate.py @@ -16,8 +16,8 @@ from grid2op.Reward import * from grid2op.Action import * -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN import DoubleDuelingDQN as D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig from l2rpn_baselines.utils.save_log_gif import save_log_gif DEFAULT_LOGS_DIR = "./logs-evals" diff --git a/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py b/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py index 94763c7..0ac06f2 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py +++ b/l2rpn_baselines/DoubleDuelingDQN/inspect_action_space.py @@ -17,7 +17,7 @@ from grid2op.MakeEnv import make2 from grid2op.Action import PowerlineChangeAndDispatchAction -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN import DoubleDuelingDQN class NpEncoder(json.JSONEncoder): diff --git a/l2rpn_baselines/DoubleDuelingDQN/train.py b/l2rpn_baselines/DoubleDuelingDQN/train.py index c3f2bd8..6404469 100755 --- a/l2rpn_baselines/DoubleDuelingDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingDQN/train.py @@ -10,8 +10,8 @@ import argparse -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN import DoubleDuelingDQN as D3QNAgent -from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN import DoubleDuelingDQN as 
D3QNAgent +from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig import DoubleDuelingDQNConfig as D3QNConfig DEFAULT_NAME = "DoubleDuelingDQN" DEFAULT_SAVE_DIR = "./models" diff --git a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py index 085c33f..b6a6aa2 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/__init__.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/__init__.py @@ -5,8 +5,8 @@ "train" ] -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN import DoubleDuelingRDQN +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig import DoubleDuelingRDQNConfig from l2rpn_baselines.DoubleDuelingRDQN.evaluate import evaluate from l2rpn_baselines.DoubleDuelingRDQN.train import train diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py similarity index 98% rename from l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py rename to l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py index cda4405..602c9e9 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py @@ -20,9 +20,9 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as cfg -from l2rpn_baselines.DoubleDuelingRDQN.ExperienceBuffer import ExperienceBuffer -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as cfg +from l2rpn_baselines.DoubleDuelingRDQN.experienceBuffer import ExperienceBuffer +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN_NN import DoubleDuelingRDQN_NN class DoubleDuelingRDQN(AgentWithConverter): def __init__(self, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py similarity index 100% rename from l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQNConfig.py rename to l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py diff --git a/l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py similarity index 100% rename from l2rpn_baselines/DoubleDuelingRDQN/DoubleDuelingRDQN_NN.py rename to l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index e4b0166..66664f5 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -16,8 +16,8 @@ from grid2op.Reward import * from grid2op.Action import * -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif DEFAULT_LOGS_DIR = "./logs-eval" diff --git a/l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py 
b/l2rpn_baselines/DoubleDuelingRDQN/experienceBuffer.py similarity index 100% rename from l2rpn_baselines/DoubleDuelingRDQN/ExperienceBuffer.py rename to l2rpn_baselines/DoubleDuelingRDQN/experienceBuffer.py diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index 160823f..8831b6b 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -14,8 +14,8 @@ from grid2op.Reward import * from grid2op.Action import * -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig -from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as RDQNConfig +from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN import DoubleDuelingRDQN as RDQNAgent DEFAULT_NAME = "DoubleDuelingRDQN" DEFAULT_SAVE_DIR = "./models" diff --git a/l2rpn_baselines/DuelQLeapNet/__init__.py b/l2rpn_baselines/DuelQLeapNet/__init__.py index eb8c630..7727d56 100644 --- a/l2rpn_baselines/DuelQLeapNet/__init__.py +++ b/l2rpn_baselines/DuelQLeapNet/__init__.py @@ -5,7 +5,7 @@ "DuelQLeapNet_NN" ] -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet import DuelQLeapNet +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet import DuelQLeapNet from l2rpn_baselines.DuelQLeapNet.evaluate import evaluate from l2rpn_baselines.DuelQLeapNet.train import train -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN diff --git a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet.py b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py similarity index 79% rename from l2rpn_baselines/DuelQLeapNet/DuelQLeapNet.py rename to l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py index bce88c2..b61351d 100644 --- a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet.py +++ b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py @@ -8,14 +8,14 @@ import numpy as np from l2rpn_baselines.utils import DeepQAgent -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN DEFAULT_NAME = "DuelQLeapNet" class DuelQLeapNet(DeepQAgent): """ - Inheriting from :class:`l2rpn_baselines.DeepQAgent` this class implements the particular agent used for the + Inheriting from :class:`l2rpn_baselines.utils.deepQAgent.DeepQAgent` this class implements the particular agent used for the Double Duelling Deep Q network baseline, with the particularity that the Q network is encoded with a leap net. It does nothing in particular. 
diff --git a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py similarity index 100% rename from l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py rename to l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py diff --git a/l2rpn_baselines/DuelQLeapNet/evaluate.py b/l2rpn_baselines/DuelQLeapNet/evaluate.py index e8d6b00..4e4a11d 100644 --- a/l2rpn_baselines/DuelQLeapNet/evaluate.py +++ b/l2rpn_baselines/DuelQLeapNet/evaluate.py @@ -14,9 +14,9 @@ from grid2op.Runner import Runner from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet import DuelQLeapNet, DEFAULT_NAME -from l2rpn_baselines.DuelQLeapNet.LeapNet_NNParam import LeapNet_NNParam -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet import DuelQLeapNet, DEFAULT_NAME +from l2rpn_baselines.DuelQLeapNet.leapNet_NNParam import LeapNet_NNParam +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" DEFAULT_NB_EPISODE = 1 @@ -35,7 +35,7 @@ def evaluate(env, save_gif=False, filter_action_fun=None): """ - How to evaluate the performances of the trained DeepQSimple agent. + How to evaluate the performances of the trained :class:`DuelQLeapNet` agent. Parameters ---------- @@ -70,7 +70,7 @@ def evaluate(env, Returns ------- - agent: :class:`l2rpn_baselines.utils.DeepQAgent` + agent: :class:`DuelQLeapNet` The loaded agent that has been evaluated thanks to the runner. res: ``list`` @@ -79,7 +79,7 @@ def evaluate(env, Examples ------- - You can evaluate a DeepQSimple this way: + You can evaluate a :class:`DuelQLeapNet` this way: .. code-block:: python diff --git a/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py b/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py similarity index 98% rename from l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py rename to l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py index 10c1140..736a983 100644 --- a/l2rpn_baselines/DuelQLeapNet/LeapNet_NNParam.py +++ b/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py @@ -9,7 +9,7 @@ import copy from l2rpn_baselines.utils import NNParam -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN class LeapNet_NNParam(NNParam): diff --git a/l2rpn_baselines/DuelQLeapNet/train.py b/l2rpn_baselines/DuelQLeapNet/train.py index 5fea200..e92bf1c 100755 --- a/l2rpn_baselines/DuelQLeapNet/train.py +++ b/l2rpn_baselines/DuelQLeapNet/train.py @@ -12,10 +12,10 @@ import warnings from l2rpn_baselines.utils import cli_train -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet import DuelQLeapNet, DEFAULT_NAME -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet import DuelQLeapNet, DEFAULT_NAME +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN from l2rpn_baselines.utils import TrainingParam -from l2rpn_baselines.DuelQLeapNet.LeapNet_NNParam import LeapNet_NNParam +from l2rpn_baselines.DuelQLeapNet.leapNet_NNParam import LeapNet_NNParam from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY @@ -31,7 +31,7 @@ def train(env, kwargs_converters={}, kwargs_archi={}): """ - This function implements the "training" part of the balines "DuelQLeapNet". + This function implements the "training" part of the balines :class:`DuelQLeapNet`. 
Parameters ---------- @@ -83,7 +83,7 @@ def train(env, Examples --------- - Here is an example on how to train a DuelQLeapNet baseline. + Here is an example on how to train a :class:`DuelQLeapNet` baseline. First define a python script, for example diff --git a/l2rpn_baselines/DuelQSimple/__init__.py b/l2rpn_baselines/DuelQSimple/__init__.py index a247e87..a189880 100644 --- a/l2rpn_baselines/DuelQSimple/__init__.py +++ b/l2rpn_baselines/DuelQSimple/__init__.py @@ -5,7 +5,7 @@ "DuelQ_NNParam" ] -from l2rpn_baselines.DuelQSimple.DuelQSimple import DuelQSimple +from l2rpn_baselines.DuelQSimple.duelQSimple import DuelQSimple from l2rpn_baselines.DuelQSimple.evaluate import evaluate from l2rpn_baselines.DuelQSimple.train import train -from l2rpn_baselines.DuelQSimple.DuelQ_NNParam import DuelQ_NNParam +from l2rpn_baselines.DuelQSimple.duelQ_NNParam import DuelQ_NNParam diff --git a/l2rpn_baselines/DuelQSimple/DuelQSimple.py b/l2rpn_baselines/DuelQSimple/duelQSimple.py similarity index 100% rename from l2rpn_baselines/DuelQSimple/DuelQSimple.py rename to l2rpn_baselines/DuelQSimple/duelQSimple.py diff --git a/l2rpn_baselines/DuelQSimple/DuelQ_NN.py b/l2rpn_baselines/DuelQSimple/duelQ_NN.py similarity index 100% rename from l2rpn_baselines/DuelQSimple/DuelQ_NN.py rename to l2rpn_baselines/DuelQSimple/duelQ_NN.py diff --git a/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py b/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py similarity index 95% rename from l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py rename to l2rpn_baselines/DuelQSimple/duelQ_NNParam.py index 87fdba0..8a40234 100644 --- a/l2rpn_baselines/DuelQSimple/DuelQ_NNParam.py +++ b/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py @@ -9,7 +9,7 @@ import copy from l2rpn_baselines.utils import NNParam -from l2rpn_baselines.DuelQSimple.DuelQ_NN import DuelQ_NN +from l2rpn_baselines.DuelQSimple.duelQ_NN import DuelQ_NN class DuelQ_NNParam(NNParam): diff --git a/l2rpn_baselines/DuelQSimple/evaluate.py b/l2rpn_baselines/DuelQSimple/evaluate.py index ba75c4f..7825394 100644 --- a/l2rpn_baselines/DuelQSimple/evaluate.py +++ b/l2rpn_baselines/DuelQSimple/evaluate.py @@ -14,9 +14,9 @@ from grid2op.Runner import Runner from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.DuelQSimple.DuelQSimple import DuelQSimple, DEFAULT_NAME -from l2rpn_baselines.DuelQSimple.DuelQ_NNParam import DuelQ_NNParam -from l2rpn_baselines.DuelQSimple.DuelQ_NN import DuelQ_NN +from l2rpn_baselines.DuelQSimple.duelQSimple import DuelQSimple, DEFAULT_NAME +from l2rpn_baselines.DuelQSimple.duelQ_NNParam import DuelQ_NNParam +from l2rpn_baselines.DuelQSimple.duelQ_NN import DuelQ_NN DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" diff --git a/l2rpn_baselines/DuelQSimple/train.py b/l2rpn_baselines/DuelQSimple/train.py index 8c990b2..9caea6c 100755 --- a/l2rpn_baselines/DuelQSimple/train.py +++ b/l2rpn_baselines/DuelQSimple/train.py @@ -12,9 +12,9 @@ import warnings from l2rpn_baselines.utils import cli_train -from l2rpn_baselines.DuelQSimple.DuelQSimple import DuelQSimple, DEFAULT_NAME -from l2rpn_baselines.DuelQSimple.DuelQ_NNParam import DuelQ_NNParam -from l2rpn_baselines.DuelQSimple.DuelQ_NN import DuelQ_NN +from l2rpn_baselines.DuelQSimple.duelQSimple import DuelQSimple, DEFAULT_NAME +from l2rpn_baselines.DuelQSimple.duelQ_NNParam import DuelQ_NNParam +from l2rpn_baselines.DuelQSimple.duelQ_NN import DuelQ_NN from l2rpn_baselines.utils import TrainingParam from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY diff --git 
a/l2rpn_baselines/ExpertAgent/__init__.py b/l2rpn_baselines/ExpertAgent/__init__.py index bcaadf1..bb82d21 100644 --- a/l2rpn_baselines/ExpertAgent/__init__.py +++ b/l2rpn_baselines/ExpertAgent/__init__.py @@ -4,7 +4,7 @@ "other_rewards" ] -from l2rpn_baselines.ExpertAgent.ExpertAgent import ExpertAgent +from l2rpn_baselines.ExpertAgent.expertAgent import ExpertAgent from l2rpn_baselines.ExpertAgent.evaluate import evaluate -from l2rpn_baselines.ExpertAgent.ExpertAgent import other_rewards +from l2rpn_baselines.ExpertAgent.expertAgent import other_rewards diff --git a/l2rpn_baselines/ExpertAgent/evaluate.py b/l2rpn_baselines/ExpertAgent/evaluate.py index 3c1ebad..5cad27e 100644 --- a/l2rpn_baselines/ExpertAgent/evaluate.py +++ b/l2rpn_baselines/ExpertAgent/evaluate.py @@ -19,7 +19,7 @@ try: from l2rpn_baselines.ExpertAgent import ExpertAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif - from l2rpn_baselines.ExpertAgent.ExpertAgent import other_rewards + from l2rpn_baselines.ExpertAgent.expertAgent import other_rewards _CAN_USE_EXPERT_BASELINE = True except ImportError as exc_: _CAN_USE_EXPERT_BASELINE = False diff --git a/l2rpn_baselines/ExpertAgent/ExpertAgent.py b/l2rpn_baselines/ExpertAgent/expertAgent.py similarity index 100% rename from l2rpn_baselines/ExpertAgent/ExpertAgent.py rename to l2rpn_baselines/ExpertAgent/expertAgent.py diff --git a/l2rpn_baselines/LeapNetEncoded/__init__.py b/l2rpn_baselines/LeapNetEncoded/__init__.py index c801db8..ba2e5ae 100644 --- a/l2rpn_baselines/LeapNetEncoded/__init__.py +++ b/l2rpn_baselines/LeapNetEncoded/__init__.py @@ -5,7 +5,7 @@ "LeapNetEncoded_NN" ] -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded import LeapNetEncoded from l2rpn_baselines.LeapNetEncoded.evaluate import evaluate from l2rpn_baselines.LeapNetEncoded.train import train -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py index 5f856db..158019b 100644 --- a/l2rpn_baselines/LeapNetEncoded/evaluate.py +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -14,9 +14,9 @@ from grid2op.Runner import Runner from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN import pdb diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py similarity index 78% rename from l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py rename to l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py index 6c43364..81664bb 100644 --- a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py @@ -14,8 +14,10 @@ class LeapNetEncoded(DeepQAgent): """ - Inheriting from :class:`l2rpn_baselines.DeepQAgent` this class implements the particular agent used for the - Double Duelling Deep Q network baseline, with the 
particularity that the Q network is encoded with a leap net. + Inheriting from :class:`l2rpn_baselines.utils.deepQAgent.DeepQAgent` + this class implements the particular agent used for the + Double Duelling Deep Q network baseline, with the particularity + that the Q network is encoded with a leap net. It does nothing in particular. """ diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py similarity index 98% rename from l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py rename to l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py index 3f8e347..c158699 100644 --- a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py @@ -27,7 +27,7 @@ _CAN_USE_TENSORFLOW = False from l2rpn_baselines.utils import BaseDeepQ, TrainingParam -from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import LtauBis +from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import LtauBis class LeapNetEncoded_NN(BaseDeepQ): @@ -43,7 +43,7 @@ class LeapNetEncoded_NN(BaseDeepQ): - a q network, that uses the output of the state encoder to predict which action is best. The Q network can have other types of input, and can also be a leap net, see the class - :class:`l2rpn_baselines.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam` for more information + :class:`l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam.LeapNetEncoded_NNParam` for more information """ def __init__(self, diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py similarity index 98% rename from l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py rename to l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py index ee4a300..bb67d97 100644 --- a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py @@ -10,12 +10,12 @@ import copy from l2rpn_baselines.utils import NNParam -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN class LeapNetEncoded_NNParam(NNParam): """ - This class implements the type of parameters used by the DuelQLeapNet model. + This class implements the type of parameters used by the :class:`LeapNetEncoded` model. 
More information on the leap net can be found at `Leap Net on Github `_ diff --git a/l2rpn_baselines/LeapNetEncoded/study.py b/l2rpn_baselines/LeapNetEncoded/study.py index 57c2318..7059a8c 100644 --- a/l2rpn_baselines/LeapNetEncoded/study.py +++ b/l2rpn_baselines/LeapNetEncoded/study.py @@ -14,9 +14,9 @@ from grid2op.MakeEnv import make -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN import pdb @@ -27,14 +27,14 @@ def study(env, - name=DEFAULT_NAME, - load_path=None, - logs_path=DEFAULT_LOGS_DIR, - nb_episode=DEFAULT_NB_EPISODE, - nb_process=DEFAULT_NB_PROCESS, - max_steps=DEFAULT_MAX_STEPS, - verbose=False, - save_gif=False): + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): """study the prediction of the grid_model""" import tensorflow as tf diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py index bf0ceac..b48092e 100755 --- a/l2rpn_baselines/LeapNetEncoded/train.py +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -12,10 +12,10 @@ import warnings from l2rpn_baselines.utils import cli_train -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN from l2rpn_baselines.utils import TrainingParam -from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam import LeapNetEncoded_NNParam from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY @@ -31,9 +31,8 @@ def train(env, kwargs_converters={}, kwargs_archi={}): """ - This function implements the "training" part of the baselines "SAC". This is the "old" implementation - that most likely had bugs. We keep it here for backward compatibility, but it is not recommended to - use it on new projects. + This function implements the "training" part of the baselines :class:`LeapNetEncoded`. + Parameters ---------- @@ -56,7 +55,7 @@ def train(env, logs_dir: ``str`` Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. - training_param: :class:`l2rpn_baselines.utils.TrainingParam` + training_param: :class:`l2rpn_baselines.utils.trainingParam.TrainingParam` The parameters describing the way you will train your model. filter_action_fun: ``function`` @@ -77,7 +76,7 @@ def train(env, Returns ------- - baseline: :class:`DuelQLeapNet` + baseline: :class:`LeapNetEncoded`` The trained baseline. @@ -85,7 +84,7 @@ def train(env, Examples --------- - Here is an example on how to train a DuelQLeapNet baseline. + Here is an example on how to train a :class:`LeapNetEncoded` baseline. 
First define a python script, for example diff --git a/l2rpn_baselines/PPO_SB3/__init__.py b/l2rpn_baselines/PPO_SB3/__init__.py new file mode 100644 index 0000000..aef1732 --- /dev/null +++ b/l2rpn_baselines/PPO_SB3/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +__all__ = [ + "evaluate", + "train", + "PPO_SB3" +] + +from l2rpn_baselines.PPO_SB3.utils import SB3Agent as PPO_SB3 +from l2rpn_baselines.PPO_SB3.evaluate import evaluate +from l2rpn_baselines.PPO_SB3.train import train diff --git a/l2rpn_baselines/ppo_stablebaselines/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py similarity index 99% rename from l2rpn_baselines/ppo_stablebaselines/evaluate.py rename to l2rpn_baselines/PPO_SB3/evaluate.py index f414f7d..eb440e0 100644 --- a/l2rpn_baselines/ppo_stablebaselines/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -14,7 +14,7 @@ from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace -from l2rpn_baselines.ppo_stablebaselines.utils import SB3Agent +from l2rpn_baselines.PPO_SB3.utils import SB3Agent def evaluate(env, diff --git a/l2rpn_baselines/ppo_stablebaselines/train.py b/l2rpn_baselines/PPO_SB3/train.py similarity index 100% rename from l2rpn_baselines/ppo_stablebaselines/train.py rename to l2rpn_baselines/PPO_SB3/train.py diff --git a/l2rpn_baselines/ppo_stablebaselines/utils.py b/l2rpn_baselines/PPO_SB3/utils.py similarity index 100% rename from l2rpn_baselines/ppo_stablebaselines/utils.py rename to l2rpn_baselines/PPO_SB3/utils.py diff --git a/l2rpn_baselines/SliceRDQN/__init__.py b/l2rpn_baselines/SliceRDQN/__init__.py index b6e9fa3..b403c07 100644 --- a/l2rpn_baselines/SliceRDQN/__init__.py +++ b/l2rpn_baselines/SliceRDQN/__init__.py @@ -5,8 +5,8 @@ "train" ] -from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN -from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config +from l2rpn_baselines.SliceRDQN.sliceRDQN import SliceRDQN +from l2rpn_baselines.SliceRDQN.sliceRDQN_Config import SliceRDQN_Config from l2rpn_baselines.SliceRDQN.evaluate import evaluate from l2rpn_baselines.SliceRDQN.train import train diff --git a/l2rpn_baselines/SliceRDQN/evaluate.py b/l2rpn_baselines/SliceRDQN/evaluate.py index daa925e..fdb7655 100755 --- a/l2rpn_baselines/SliceRDQN/evaluate.py +++ b/l2rpn_baselines/SliceRDQN/evaluate.py @@ -12,7 +12,7 @@ import argparse from grid2op.MakeEnv import make -from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent +from l2rpn_baselines.SliceRDQN.sliceRDQN import SliceRDQN as RDQNAgent from l2rpn_baselines.utils.save_log_gif import save_log_gif DEFAULT_LOGS_DIR = "./logs-eval" diff --git a/l2rpn_baselines/SliceRDQN/ExperienceBuffer.py b/l2rpn_baselines/SliceRDQN/experienceBuffer.py similarity index 100% rename from l2rpn_baselines/SliceRDQN/ExperienceBuffer.py rename to l2rpn_baselines/SliceRDQN/experienceBuffer.py diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN.py b/l2rpn_baselines/SliceRDQN/sliceRDQN.py similarity index 98% rename from l2rpn_baselines/SliceRDQN/SliceRDQN.py rename to l2rpn_baselines/SliceRDQN/sliceRDQN.py index 5aaeb7f..827e25e 100644 --- 
a/l2rpn_baselines/SliceRDQN/SliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/sliceRDQN.py @@ -20,9 +20,9 @@ from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct -from l2rpn_baselines.SliceRDQN.ExperienceBuffer import ExperienceBuffer -from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as cfg -from l2rpn_baselines.SliceRDQN.SliceRDQN_NN import SliceRDQN_NN +from l2rpn_baselines.SliceRDQN.experienceBuffer import ExperienceBuffer +from l2rpn_baselines.SliceRDQN.sliceRDQN_Config import SliceRDQN_Config as cfg +from l2rpn_baselines.SliceRDQN.sliceRDQN_NN import SliceRDQN_NN from l2rpn_baselines.SliceRDQN.slice_util import * diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py b/l2rpn_baselines/SliceRDQN/sliceRDQN_Config.py similarity index 100% rename from l2rpn_baselines/SliceRDQN/SliceRDQN_Config.py rename to l2rpn_baselines/SliceRDQN/sliceRDQN_Config.py diff --git a/l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py b/l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py similarity index 100% rename from l2rpn_baselines/SliceRDQN/SliceRDQN_NN.py rename to l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index 5539426..b6326c6 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -13,8 +13,8 @@ from grid2op.MakeEnv import make from grid2op.Parameters import Parameters -from l2rpn_baselines.SliceRDQN.SliceRDQN import SliceRDQN as RDQNAgent -from l2rpn_baselines.SliceRDQN.SliceRDQN_Config import SliceRDQN_Config as RDQNConfig +from l2rpn_baselines.SliceRDQN.sliceRDQN import SliceRDQN as RDQNAgent +from l2rpn_baselines.SliceRDQN.sliceRDQN_Config import SliceRDQN_Config as RDQNConfig DEFAULT_NAME = "SliceRDQN" DEFAULT_SAVE_DIR = "./models" diff --git a/l2rpn_baselines/Template/__init__.py b/l2rpn_baselines/Template/__init__.py index 7411c9f..6d959fe 100644 --- a/l2rpn_baselines/Template/__init__.py +++ b/l2rpn_baselines/Template/__init__.py @@ -4,7 +4,7 @@ "train" ] -from l2rpn_baselines.Template.Template import Template +from l2rpn_baselines.Template.template import Template from l2rpn_baselines.Template.evaluate import evaluate from l2rpn_baselines.Template.train import train @@ -14,7 +14,7 @@ in this __init__.py file: - `XXX` [**mandatory**] contains the definition of your baseline. It must follow the directives - given in "Template.py" + given in "template.py" (or "Template.py" at your convenience) - `evaluate` [**mandatory**] contains the script to evaluate the performance of this baseline. It must follow the directive in "evaluate.py" - `train` [**optional**] contains the script to train your baseline. 
If provided, it must follow diff --git a/l2rpn_baselines/Template/evaluate.py b/l2rpn_baselines/Template/evaluate.py index f5c1da0..5361e54 100755 --- a/l2rpn_baselines/Template/evaluate.py +++ b/l2rpn_baselines/Template/evaluate.py @@ -11,7 +11,7 @@ import os from grid2op.Runner import Runner -from l2rpn_baselines.Template.Template import Template +from l2rpn_baselines.Template.template import Template from l2rpn_baselines.utils.save_log_gif import save_log_gif diff --git a/l2rpn_baselines/Template/Template.py b/l2rpn_baselines/Template/template.py similarity index 100% rename from l2rpn_baselines/Template/Template.py rename to l2rpn_baselines/Template/template.py diff --git a/l2rpn_baselines/Template/train.py b/l2rpn_baselines/Template/train.py index c30ab21..b801120 100755 --- a/l2rpn_baselines/Template/train.py +++ b/l2rpn_baselines/Template/train.py @@ -9,7 +9,7 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -from l2rpn_baselines.Template.Template import Template +from l2rpn_baselines.Template.template import Template def train(env, diff --git a/l2rpn_baselines/ppo_stablebaselines/__init__.py b/l2rpn_baselines/ppo_stablebaselines/__init__.py deleted file mode 100644 index dbf9799..0000000 --- a/l2rpn_baselines/ppo_stablebaselines/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -__all__ = [ - "evaluate", - "train", - "PPOSB_Agent" -] - -from l2rpn_baselines.ppo_stablebaselines.utils import SB3Agent as PPOSB_Agent -from l2rpn_baselines.ppo_stablebaselines.evaluate import evaluate -from l2rpn_baselines.ppo_stablebaselines.train import train diff --git a/l2rpn_baselines/test/test_import.py b/l2rpn_baselines/test/test_import.py index 8dbb686..7debb31 100644 --- a/l2rpn_baselines/test/test_import.py +++ b/l2rpn_baselines/test/test_import.py @@ -16,7 +16,7 @@ class TestImport(object): def test_import(self): module_name = self.load_module() exec(f"import l2rpn_baselines.{module_name}") - exec(f"import l2rpn_baselines.{module_name}.{module_name}") + exec(f"from l2rpn_baselines.{module_name} import {module_name}") exec(f"from l2rpn_baselines.{module_name} import evaluate") assert 1+1 == 2 @@ -85,6 +85,10 @@ class TestExpertAgent(TestImport, unittest.TestCase): def load_module(self): return "ExpertAgent" +class TestPPOSB3(TestImport, unittest.TestCase): + def load_module(self): + return "PPO_SB3" + # because it deactivates the eager mode # class TestPandapowerGeirina(TestImport, unittest.TestCase): diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py index f3ea131..06cdd77 100644 --- a/l2rpn_baselines/utils/__init__.py +++ b/l2rpn_baselines/utils/__init__.py @@ -9,6 +9,7 @@ __all__ = [ "cli_eval", +<<<<<<< HEAD "cli_train", "str2bool", "save_log_gif", @@ -21,19 +22,39 @@ "BaseDeepQ", "DeepQAgent", "GymAgent" +======= + "cli_train", + "str2bool", + "save_log_gif", + "make_multi_env", + "train_generic", + "TrainingParam", + "NNParam", + "ReplayBuffer", + "BaseDeepQ", + "DeepQAgent", + "GymAgent" +>>>>>>> branch_with_zips ] from l2rpn_baselines.utils.cli_eval import cli_eval from l2rpn_baselines.utils.cli_train import cli_train from l2rpn_baselines.utils.str2bool import str2bool from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.utils.zip_for_codalab import zip_for_codalab from l2rpn_baselines.utils.train_generic import train_generic from l2rpn_baselines.utils.make_multi_env import make_multi_env +<<<<<<< HEAD from l2rpn_baselines.utils.TrainingParam import TrainingParam from 
l2rpn_baselines.utils.NNParam import NNParam from l2rpn_baselines.utils.ReplayBuffer import ReplayBuffer from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ from l2rpn_baselines.utils.DeepQAgent import DeepQAgent +======= +from l2rpn_baselines.utils.trainingParam import TrainingParam +from l2rpn_baselines.utils.nnParam import NNParam +from l2rpn_baselines.utils.replayBuffer import ReplayBuffer +from l2rpn_baselines.utils.baseDeepQ import BaseDeepQ +from l2rpn_baselines.utils.deepQAgent import DeepQAgent +>>>>>>> branch_with_zips from l2rpn_baselines.utils.gymAgent import GymAgent diff --git a/l2rpn_baselines/utils/BaseDeepQ.py b/l2rpn_baselines/utils/baseDeepQ.py similarity index 99% rename from l2rpn_baselines/utils/BaseDeepQ.py rename to l2rpn_baselines/utils/baseDeepQ.py index 5db864f..15f8f33 100644 --- a/l2rpn_baselines/utils/BaseDeepQ.py +++ b/l2rpn_baselines/utils/baseDeepQ.py @@ -20,7 +20,7 @@ -from l2rpn_baselines.utils.TrainingParam import TrainingParam +from l2rpn_baselines.utils.trainingParam import TrainingParam # refactorization of the code in a base class to avoid copy paste. diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/deepQAgent.py similarity index 99% rename from l2rpn_baselines/utils/DeepQAgent.py rename to l2rpn_baselines/utils/deepQAgent.py index 2a76764..679e6b9 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/deepQAgent.py @@ -11,13 +11,12 @@ import numpy as np from tqdm import tqdm -import grid2op from grid2op.Exceptions import Grid2OpException from grid2op.Agent import AgentWithConverter from grid2op.Converter import IdToAct -from l2rpn_baselines.utils.ReplayBuffer import ReplayBuffer -from l2rpn_baselines.utils.TrainingParam import TrainingParam +from l2rpn_baselines.utils.replayBuffer import ReplayBuffer +from l2rpn_baselines.utils.trainingParam import TrainingParam try: from grid2op.Chronics import MultifolderWithCache diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 3ff8cd8..b29eaac 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -7,11 +7,16 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. from abc import abstractmethod +<<<<<<< HEAD + +from grid2op.Agent import BaseAgent +======= import copy from grid2op.Agent import BaseAgent from grid2op.Observation import BaseObservation from grid2op.Action import BaseAction +>>>>>>> branch_with_zips class GymAgent(BaseAgent): @@ -24,6 +29,17 @@ class GymAgent(BaseAgent): Use it only with a trained agent. It does not provide the "save" method and is not suitable for training. 
+<<<<<<< HEAD + """ + def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path): + super().__init__(g2op_action_space) + self._gym_act_space = gym_act_space + self._gym_obs_space = gym_obs_space + self._nn_path = nn_path + self.nn_model = None + self.load() + +======= ..info:: To load a previously saved agent the function `GymAgent.load` will be called @@ -69,6 +85,7 @@ def __init__(self, else: self.build() +>>>>>>> branch_with_zips @abstractmethod def get_act(self, gym_obs, reward, done): """ @@ -79,6 +96,13 @@ def get_act(self, gym_obs, reward, done): @abstractmethod def load(self): """ +<<<<<<< HEAD + Load the NN models + """ + pass + + def act(self, observation, reward, done): +======= Load the NN model ..info:: Only called if the agent has been build with `nn_path` not None and `nn_kwargs=None` @@ -120,6 +144,7 @@ def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAc In this case the "gym agent" will only be used in particular settings. """ +>>>>>>> branch_with_zips gym_obs = self._gym_obs_space.to_gym(observation) gym_act = self.get_act(gym_obs, reward, done) grid2op_act = self._gym_act_space.from_gym(gym_act) diff --git a/l2rpn_baselines/utils/NNParam.py b/l2rpn_baselines/utils/nnParam.py similarity index 99% rename from l2rpn_baselines/utils/NNParam.py rename to l2rpn_baselines/utils/nnParam.py index 3b9e350..6a7c762 100644 --- a/l2rpn_baselines/utils/NNParam.py +++ b/l2rpn_baselines/utils/nnParam.py @@ -11,7 +11,7 @@ from collections.abc import Iterable import grid2op -from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ +from l2rpn_baselines.utils.baseDeepQ import BaseDeepQ class NNParam(object): diff --git a/l2rpn_baselines/utils/ReplayBuffer.py b/l2rpn_baselines/utils/replayBuffer.py similarity index 100% rename from l2rpn_baselines/utils/ReplayBuffer.py rename to l2rpn_baselines/utils/replayBuffer.py diff --git a/l2rpn_baselines/utils/RLAgent.py b/l2rpn_baselines/utils/rlAgent.py similarity index 95% rename from l2rpn_baselines/utils/RLAgent.py rename to l2rpn_baselines/utils/rlAgent.py index aa1d2d4..044a91b 100644 --- a/l2rpn_baselines/utils/RLAgent.py +++ b/l2rpn_baselines/utils/rlAgent.py @@ -10,6 +10,7 @@ import numpy as np from grid2op.Agent import BaseAgent +from l2rpn_baselines.utils.trainingParam import TrainingParam class RLAgent(BaseAgent): @@ -79,4 +80,4 @@ def load(self, path): for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]: conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr)) if os.path.exists(conv_path): - setattr(self, nm_attr, np.load(file=conv_path)) \ No newline at end of file + setattr(self, nm_attr, np.load(file=conv_path)) diff --git a/l2rpn_baselines/utils/TrainingParam.py b/l2rpn_baselines/utils/trainingParam.py similarity index 100% rename from l2rpn_baselines/utils/TrainingParam.py rename to l2rpn_baselines/utils/trainingParam.py From 414e3d31a5a6be71ce62a95e773836d1442866f7 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 17:42:19 +0100 Subject: [PATCH 08/56] fix remaining conflicts --- l2rpn_baselines/utils/__init__.py | 23 ----------------------- l2rpn_baselines/utils/gymAgent.py | 25 ------------------------- 2 files changed, 48 deletions(-) diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py index 06cdd77..4f6ee00 100644 --- a/l2rpn_baselines/utils/__init__.py +++ b/l2rpn_baselines/utils/__init__.py @@ -9,20 +9,6 @@ __all__ = [ "cli_eval", -<<<<<<< HEAD - "cli_train", - "str2bool", - "save_log_gif", - "make_multi_env", - 
"zip_for_codalab", - "train_generic", - "TrainingParam", - "NNParam", - "ReplayBuffer", - "BaseDeepQ", - "DeepQAgent", - "GymAgent" -======= "cli_train", "str2bool", "save_log_gif", @@ -34,7 +20,6 @@ "BaseDeepQ", "DeepQAgent", "GymAgent" ->>>>>>> branch_with_zips ] from l2rpn_baselines.utils.cli_eval import cli_eval @@ -44,17 +29,9 @@ from l2rpn_baselines.utils.train_generic import train_generic from l2rpn_baselines.utils.make_multi_env import make_multi_env -<<<<<<< HEAD -from l2rpn_baselines.utils.TrainingParam import TrainingParam -from l2rpn_baselines.utils.NNParam import NNParam -from l2rpn_baselines.utils.ReplayBuffer import ReplayBuffer -from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ -from l2rpn_baselines.utils.DeepQAgent import DeepQAgent -======= from l2rpn_baselines.utils.trainingParam import TrainingParam from l2rpn_baselines.utils.nnParam import NNParam from l2rpn_baselines.utils.replayBuffer import ReplayBuffer from l2rpn_baselines.utils.baseDeepQ import BaseDeepQ from l2rpn_baselines.utils.deepQAgent import DeepQAgent ->>>>>>> branch_with_zips from l2rpn_baselines.utils.gymAgent import GymAgent diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index b29eaac..3ff8cd8 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -7,16 +7,11 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. from abc import abstractmethod -<<<<<<< HEAD - -from grid2op.Agent import BaseAgent -======= import copy from grid2op.Agent import BaseAgent from grid2op.Observation import BaseObservation from grid2op.Action import BaseAction ->>>>>>> branch_with_zips class GymAgent(BaseAgent): @@ -29,17 +24,6 @@ class GymAgent(BaseAgent): Use it only with a trained agent. It does not provide the "save" method and is not suitable for training. -<<<<<<< HEAD - """ - def __init__(self, g2op_action_space, gym_act_space, gym_obs_space, nn_path): - super().__init__(g2op_action_space) - self._gym_act_space = gym_act_space - self._gym_obs_space = gym_obs_space - self._nn_path = nn_path - self.nn_model = None - self.load() - -======= ..info:: To load a previously saved agent the function `GymAgent.load` will be called @@ -85,7 +69,6 @@ def __init__(self, else: self.build() ->>>>>>> branch_with_zips @abstractmethod def get_act(self, gym_obs, reward, done): """ @@ -96,13 +79,6 @@ def get_act(self, gym_obs, reward, done): @abstractmethod def load(self): """ -<<<<<<< HEAD - Load the NN models - """ - pass - - def act(self, observation, reward, done): -======= Load the NN model ..info:: Only called if the agent has been build with `nn_path` not None and `nn_kwargs=None` @@ -144,7 +120,6 @@ def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAc In this case the "gym agent" will only be used in particular settings. 
""" ->>>>>>> branch_with_zips gym_obs = self._gym_obs_space.to_gym(observation) gym_act = self.get_act(gym_obs, reward, done) grid2op_act = self._gym_act_space.from_gym(gym_act) From 4b6d35fc57a2d99bbd77f0b35d7ff2ac6d519498 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 17:48:48 +0100 Subject: [PATCH 09/56] fix references in doc for doubleduelingdqn --- docs/doubleduelingdqn.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/doubleduelingdqn.rst b/docs/doubleduelingdqn.rst index 2420a82..5ac6a50 100644 --- a/docs/doubleduelingdqn.rst +++ b/docs/doubleduelingdqn.rst @@ -21,7 +21,7 @@ You can use this class with: from l2rpn_baselines.DoubleDuelingDQN import train from l2rpn_baselines.DoubleDuelingDQN import evaluate -.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN +.. automodule:: l2rpn_baselines.DoubleDuelingDQN :members: :autosummary: @@ -39,7 +39,7 @@ Training a model requires tweaking many hyperparameters, these can be found in a DoubleDuelingDQNConfig.FINAL_EPSILON = 0.001 DoubleDuelingDQNConfig.DECAY_EPSILON = 10000 -.. automodule:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQNConfig +.. automodule:: l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig.DoubleDuelingDQNConfig :members: :undoc-members: @@ -50,9 +50,9 @@ You may want to import it manually: .. code-block:: python - from l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN import DoubleDuelingDQN_NN + from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN_NN import DoubleDuelingDQN_NN -.. autoclass:: l2rpn_baselines.DoubleDuelingDQN.DoubleDuelingDQN_NN.DoubleDuelingDQN_NN +.. autoclass:: l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN_NN.DoubleDuelingDQN_NN :members: :autosummary: From 3530161278a4746f975b0e45141d213ded8bdc42 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 17:51:23 +0100 Subject: [PATCH 10/56] fix references in doc for doubleduelingrdqn --- docs/doubleduelingrdqn.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/doubleduelingrdqn.rst b/docs/doubleduelingrdqn.rst index b1e280f..143bd40 100644 --- a/docs/doubleduelingrdqn.rst +++ b/docs/doubleduelingrdqn.rst @@ -21,7 +21,7 @@ You can use this class with: from l2rpn_baselines.DoubleDuelingRDQN import train from l2rpn_baselines.DoubleDuelingRDQN import evaluate -.. automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN +.. automodule:: l2rpn_baselines.DoubleDuelingRDQN :members: :autosummary: @@ -37,7 +37,7 @@ Training a model requires tweaking many hyperparameters, these can be found in a DoubleDuelingRDQNConfig.LR = 1e-5 DoubleDuelingRDQNConfig.TRACE_LENGTH = 12 -.. automodule:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQNConfig +.. automodule:: l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig.DoubleDuelingRDQNConfig :members: :undoc-members: @@ -48,9 +48,9 @@ You may want to import it manually: .. code-block:: python - from l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN import DoubleDuelingRDQN_NN + from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN_NN import DoubleDuelingRDQN_NN -.. autoclass:: l2rpn_baselines.DoubleDuelingRDQN.DoubleDuelingRDQN_NN.DoubleDuelingRDQN_NN +.. 
autoclass:: l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN_NN.DoubleDuelingRDQN_NN :members: :autosummary: From bce5c083303e9ca0124f8c37299196b2c6d16f9e Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 17:59:46 +0100 Subject: [PATCH 11/56] fix references in doc for duelqleapnet --- docs/duelqleapnet.rst | 9 +++++---- docs/index.rst | 7 ++++++- l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py | 12 ++++++++---- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/duelqleapnet.rst b/docs/duelqleapnet.rst index 59f614d..cce7bcf 100644 --- a/docs/duelqleapnet.rst +++ b/docs/duelqleapnet.rst @@ -32,16 +32,17 @@ You can use this class with: Other non exported class ------------------------ These classes need to be imported, if you want to import them with (non exhaustive list): + .. code-block:: python - from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import DuelQLeapNet_NN - from l2rpn_baselines.DuelQLeapNet.LeapNet_NNParam import LeapNet_NNParam + from l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN import DuelQLeapNet_NN + from l2rpn_baselines.DuelQLeapNet.leapNet_NNParam import LeapNet_NNParam -.. autoclass:: l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN.DuelQLeapNet_NN +.. autoclass:: l2rpn_baselines.DuelQLeapNet.duelQLeapNet_NN.DuelQLeapNet_NN :members: :autosummary: -.. autoclass:: l2rpn_baselines.DuelQLeapNet.LeapNet_NNParam.LeapNet_NNParam +.. autoclass:: l2rpn_baselines.DuelQLeapNet.leapNet_NNParam.LeapNet_NNParam :members: :autosummary: diff --git a/docs/index.rst b/docs/index.rst index 143c7a6..57f036f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,8 +3,9 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +============================================ Welcome to l2rpn-baselines's documentation! -=========================================== +============================================ In this documentation we expose first what is this package about and how to contribute, and then which baselines are already implemented in this package. @@ -14,6 +15,7 @@ How to contribute .. toctree:: :maxdepth: 2 + :caption: How to contribute template donothing @@ -29,6 +31,7 @@ baseline with hyper parameters tuned that performs correctly. .. toctree:: :maxdepth: 2 + :caption: Reference baselines utils deepqsimple @@ -43,6 +46,7 @@ More advanced baselines .. toctree:: :maxdepth: 2 + :caption: More advanced baselines and Contributions duelqleapnet doubleduelingrdqn @@ -55,6 +59,7 @@ Deprecated baselines .. toctree:: :maxdepth: 2 + :caption: Deprecated baselines sacold diff --git a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py index 2b35cc8..a0f8ff8 100644 --- a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py +++ b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py @@ -115,10 +115,14 @@ def __init__(self, def construct_q_network(self): """ - First the :attr:`l2rpn_baselines.BaseDeepQ.nn_archi` parameters are used to create a neural network - to 'encode' the data. Then the leaps occur. - - Afterward the model is split into value an advantage, and treated as usually in any D3QN. + Build the Q network appropriatly. + + It first build a standard Q network with regular inputs x. + + Then encodes the tau + + Then data are split and used in the "value" and the "advantage" networks as + done usually in D3QN. 
""" # Uses the network architecture found in DeepMind paper From 6621f97b1e3e65ea4efb52f4c8fa44930b2e5fc3 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 18:02:42 +0100 Subject: [PATCH 12/56] fix references in doc for duelqsimple --- docs/duelqsimple.rst | 11 +++++++++-- l2rpn_baselines/DuelQSimple/duelQSimple.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/duelqsimple.rst b/docs/duelqsimple.rst index 436456f..9c6af51 100644 --- a/docs/duelqsimple.rst +++ b/docs/duelqsimple.rst @@ -27,11 +27,18 @@ You can use this class with: Other non exported class ------------------------ These classes need to be imported, if you want to import them with (non exhaustive list): + .. code-block:: python - from l2rpn_baselines.DuelQSimple.DuelQ_NN import DuelQ_NN + from l2rpn_baselines.DuelQSimple.duelQ_NN import DuelQ_NN + from l2rpn_baselines.DuelQSimple.duelQ_NN import DuelQ_NNParam + + +.. autoclass:: l2rpn_baselines.DuelQSimple.duelQ_NN.DuelQ_NN + :members: + :autosummary: -.. autoclass:: l2rpn_baselines.DuelQSimple.DuelQ_NN.DuelQ_NN +.. autoclass:: l2rpn_baselines.DuelQSimple.duelQ_NNParam.DuelQ_NNParam :members: :autosummary: diff --git a/l2rpn_baselines/DuelQSimple/duelQSimple.py b/l2rpn_baselines/DuelQSimple/duelQSimple.py index 35bfb0a..b1be20c 100644 --- a/l2rpn_baselines/DuelQSimple/duelQSimple.py +++ b/l2rpn_baselines/DuelQSimple/duelQSimple.py @@ -12,7 +12,7 @@ class DuelQSimple(DeepQAgent): """ - Inheriting from :class:`l2rpn_baselines.DeepQAgent` this class implements the particular agent used for the + Inheriting from :class:`l2rpn_baselines.utils.DeepQAgent` this class implements the particular agent used for the Double Duelling Deep Q network baseline. It does nothing in particular. From 3ebe37bb1134387a9a517ce53b0b51dbf26ed765 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 18:18:08 +0100 Subject: [PATCH 13/56] fix references in doc for leapnet encoded --- docs/leapnetencoded.rst | 8 ++++---- .../LeapNetEncoded/leapNetEncoded_NN.py | 8 +++----- .../LeapNetEncoded/leapNetEncoded_NNParam.py | 17 +++++++++++++++-- l2rpn_baselines/utils/baseDeepQ.py | 14 +++++++++----- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/docs/leapnetencoded.rst b/docs/leapnetencoded.rst index e256d72..852c909 100644 --- a/docs/leapnetencoded.rst +++ b/docs/leapnetencoded.rst @@ -39,14 +39,14 @@ These classes need to be imported, if you want to import them with (non exhausti .. code-block:: python - from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN - from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam + from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN import LeapNetEncoded_NN + from l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam import LeapNetEncoded_NNParam -.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN.LeapNetEncoded_NN +.. autoclass:: l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NN.LeapNetEncoded_NN :members: :autosummary: -.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam +.. 
autoclass:: l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam.LeapNetEncoded_NNParam :members: :autosummary: diff --git a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py index c158699..feee2d6 100644 --- a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py @@ -76,11 +76,7 @@ def __init__(self, def construct_q_network(self): """ - First the :attr:`l2rpn_baselines.BaseDeepQ.nn_archi` parameters are used to create a neural network - to 'encode' the data. Then the leaps occur. - - Afterward the model is split into value an advantage, and treated as usually in any D3QN. - + Builds the Q network. """ # Uses the network architecture found in DeepMind paper # The inputs and outputs size have changed, as well as replacing the convolution by dense layers. @@ -222,12 +218,14 @@ def _process_topo(self, topo_vect): """process the topology vector. As input grid2op encode it: + - -1 disconnected - 1 connected to bus 1 - 2 connected to bus 2 I transform it in a vector having twice as many component with the encoding, if we move "by pairs": + - [1,0] -> disconnected - [0,0] -> connected to bus 1 # normal situation - [0,1] -> connected to bus 2 diff --git a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py index bb67d97..713e2dc 100644 --- a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py @@ -50,9 +50,9 @@ class LeapNetEncoded_NNParam(NNParam): Examples -------- - All other attributes need to be created once by a call to :func:`LeapNetEncoded_NNParam.compute_dims`: + All other attributes need to be created once by a call to :func:`l2rpn_baselines.LeapNetEncoded.leapNetEncoded_NNParam.LeapNetEncoded_NNParam.compute_dims`: - ..code-block:: python + .. code-block:: python nn_archi.compute_dims(env) nn_archi.center_reduce(env) @@ -155,11 +155,21 @@ def __init__(self, self.tau_dims = tau_dims def get_obs_attr(self): + """ + Retrieve the list of the observation attributes that are used for this model. 
+ """ res = self.list_attr_obs_x + self.list_attr_obs_input_q res += self.list_attr_obs_tau + ["topo_vect"] + self.list_attr_obs_gm_out return res def compute_dims(self, env): + """Compute the dimension of the observations (dimension of x and tau) + + Parameters + ---------- + env : a grid2op environment + A grid2op environment + """ self.tau_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_tau] self.x_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_x] self.gm_out_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_gm_out] @@ -171,6 +181,9 @@ def _define_adds_mults(self, vector, varname, attr_composed, default_val): setattr(self, varname, vector) def center_reduce(self, env): + """ + Compute some basic statistics for x and tau + """ self._center_reduce_vect(env.get_obs(), "x") self._center_reduce_vect(env.get_obs(), "tau") self._center_reduce_vect(env.get_obs(), "gm_out") diff --git a/l2rpn_baselines/utils/baseDeepQ.py b/l2rpn_baselines/utils/baseDeepQ.py index 15f8f33..043d7c6 100644 --- a/l2rpn_baselines/utils/baseDeepQ.py +++ b/l2rpn_baselines/utils/baseDeepQ.py @@ -120,7 +120,8 @@ def construct_q_network(self): def predict_movement(self, data, epsilon, batch_size=None, training=False): """ - Predict movement of game controler where is epsilon probability randomly move.""" + Predict movement of game controler where is epsilon probability randomly move. + """ if batch_size is None: batch_size = data.shape[0] @@ -133,7 +134,13 @@ def predict_movement(self, data, epsilon, batch_size=None, training=False): return opt_policy, q_actions[np.arange(batch_size), opt_policy], q_actions def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): - """Trains network to fit given parameters: + """ + Trains network to fit given parameters: + + .. 
seealso:: + https://towardsdatascience.com/dueling-double-deep-q-learning-using-tensorflow-2-x-7bbbcec06a2a + for the update rules + Parameters ---------- s_batch: @@ -146,9 +153,6 @@ def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, ba says whether or not the episode was over r_batch: the reward obtained this step - - see https://towardsdatascience.com/dueling-double-deep-q-learning-using-tensorflow-2-x-7bbbcec06a2a - for the update rules """ if batch_size is None: batch_size = s_batch.shape[0] From b293482c538df76132fa6b459aa5518a55381170 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 27 Jan 2022 18:25:11 +0100 Subject: [PATCH 14/56] fix references in doc for sacold model --- docs/sacold.rst | 9 +++++---- l2rpn_baselines/SACOld/__init__.py | 4 ++-- l2rpn_baselines/SACOld/evaluate.py | 6 +++--- l2rpn_baselines/SACOld/{SACOld.py => sacOld.py} | 11 +++++++---- .../SACOld/{SACOld_NN.py => sacOld_NN.py} | 0 .../{SACOld_NNParam.py => sacOld_NNParam.py} | 12 +++++++----- l2rpn_baselines/SACOld/train.py | 16 ++++++++++------ 7 files changed, 34 insertions(+), 24 deletions(-) rename l2rpn_baselines/SACOld/{SACOld.py => sacOld.py} (63%) rename l2rpn_baselines/SACOld/{SACOld_NN.py => sacOld_NN.py} (100%) rename l2rpn_baselines/SACOld/{SACOld_NNParam.py => sacOld_NNParam.py} (87%) diff --git a/docs/sacold.rst b/docs/sacold.rst index 5147342..8719892 100644 --- a/docs/sacold.rst +++ b/docs/sacold.rst @@ -31,16 +31,17 @@ You can use this class with: Other non exported class ------------------------ These classes need to be imported, if you want to import them with (non exhaustive list): + .. code-block:: python - from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN - from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam + from l2rpn_baselines.SACOld.sacOld_NN import SACOld_NN + from l2rpn_baselines.SACOld.sacOld_NNParam import SACOld_NNParam -.. autoclass:: l2rpn_baselines.SACOld.SACOld_NN.SACOld_NN +.. autoclass:: l2rpn_baselines.SACOld.sacOld_NN.SACOld_NN :members: :autosummary: -.. autoclass:: l2rpn_baselines.SACOld.SACOld_NNParam.SACOld_NNParam +.. 
autoclass:: l2rpn_baselines.SACOld.sacOld_NNParam.SACOld_NNParam :members: :autosummary: diff --git a/l2rpn_baselines/SACOld/__init__.py b/l2rpn_baselines/SACOld/__init__.py index a2ccffb..a72ad4f 100644 --- a/l2rpn_baselines/SACOld/__init__.py +++ b/l2rpn_baselines/SACOld/__init__.py @@ -5,7 +5,7 @@ "SACOld_NNParam" ] -from l2rpn_baselines.SACOld.SACOld import SACOld +from l2rpn_baselines.SACOld.sacOld import SACOld from l2rpn_baselines.SACOld.evaluate import evaluate from l2rpn_baselines.SACOld.train import train -from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam +from l2rpn_baselines.SACOld.sacOld_NNParam import SACOld_NNParam diff --git a/l2rpn_baselines/SACOld/evaluate.py b/l2rpn_baselines/SACOld/evaluate.py index 4aaabc0..5e6c881 100644 --- a/l2rpn_baselines/SACOld/evaluate.py +++ b/l2rpn_baselines/SACOld/evaluate.py @@ -14,9 +14,9 @@ from grid2op.Runner import Runner from l2rpn_baselines.utils.save_log_gif import save_log_gif -from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME -from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam -from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN +from l2rpn_baselines.SACOld.sacOld import SACOld, DEFAULT_NAME +from l2rpn_baselines.SACOld.sacOld_NNParam import SACOld_NNParam +from l2rpn_baselines.SACOld.sacOld_NN import SACOld_NN DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" DEFAULT_NB_EPISODE = 1 diff --git a/l2rpn_baselines/SACOld/SACOld.py b/l2rpn_baselines/SACOld/sacOld.py similarity index 63% rename from l2rpn_baselines/SACOld/SACOld.py rename to l2rpn_baselines/SACOld/sacOld.py index 0f28e82..90e8604 100644 --- a/l2rpn_baselines/SACOld/SACOld.py +++ b/l2rpn_baselines/SACOld/sacOld.py @@ -7,15 +7,18 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. from l2rpn_baselines.utils import DeepQAgent -from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN +from l2rpn_baselines.SACOld.sacOld_NN import SACOld_NN DEFAULT_NAME = "SACOld" class SACOld(DeepQAgent): """ - This is the :class:`l2rpn_baselines.utils` agent representing the SAC agent (old implementation). + Do not use this SACOld class that has lots of known (but forgotten) issues. + + .. warning:: + We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. + + We will not code any SAC agent "from scratch". - Please don't use this baseline if you start a new project, prefer using the new, double check - SAC implementation instead (:class:`l2rpn_baselines.SAC.SAC`) instead. """ pass diff --git a/l2rpn_baselines/SACOld/SACOld_NN.py b/l2rpn_baselines/SACOld/sacOld_NN.py similarity index 100% rename from l2rpn_baselines/SACOld/SACOld_NN.py rename to l2rpn_baselines/SACOld/sacOld_NN.py diff --git a/l2rpn_baselines/SACOld/SACOld_NNParam.py b/l2rpn_baselines/SACOld/sacOld_NNParam.py similarity index 87% rename from l2rpn_baselines/SACOld/SACOld_NNParam.py rename to l2rpn_baselines/SACOld/sacOld_NNParam.py index 2521842..6d7f675 100644 --- a/l2rpn_baselines/SACOld/SACOld_NNParam.py +++ b/l2rpn_baselines/SACOld/sacOld_NNParam.py @@ -8,15 +8,17 @@ import copy from l2rpn_baselines.utils import NNParam -from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN +from l2rpn_baselines.SACOld.sacOld_NN import SACOld_NN class SACOld_NNParam(NNParam): """ - - Do not use this SACOld class, prefer the use of the "more correct" - class :class:`l2rpn_baselines.SAC.SAC` for new projects instead. This module is only here - for backward compatibility. 
+ Do not use this SACOld class that has lots of known (but forgotten) issues. + + .. warning:: + We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. + + We will not code any SAC agent "from scratch". Attributes ---------- diff --git a/l2rpn_baselines/SACOld/train.py b/l2rpn_baselines/SACOld/train.py index a6712de..ab5523b 100755 --- a/l2rpn_baselines/SACOld/train.py +++ b/l2rpn_baselines/SACOld/train.py @@ -12,9 +12,9 @@ import warnings from l2rpn_baselines.utils import cli_train -from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME -from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam -from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN +from l2rpn_baselines.SACOld.sacOld import SACOld, DEFAULT_NAME +from l2rpn_baselines.SACOld.sacOld_NNParam import SACOld_NNParam +from l2rpn_baselines.SACOld.sacOld_NN import SACOld_NN from l2rpn_baselines.utils import TrainingParam from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY @@ -32,7 +32,11 @@ def train(env, kwargs_archi={}): """ This function implements the "training" part of the baselines "SAC" (old buggy implementation). - Please use the :class:`l2rpn_baselines.SAC.SAC` for new projects. + + .. warning:: + We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. + + We will not code any SAC agent "from scratch". Parameters ---------- @@ -84,7 +88,7 @@ def train(env, Examples --------- - Here is an example on how to train a SAC baseline. + Here is an example on how to train a :class:`SACOld` baseline. First define a python script, for example @@ -211,7 +215,7 @@ def train(env, from grid2op.Reward import L2RPNReward import re try: - from lightsim2grid.LightSimBackend import LightSimBackend + from lightsim2grid import LightSimBackend backend = LightSimBackend() except: from grid2op.Backend import PandaPowerBackend From 653ab26ebf91db8c22e22d4a0ef8f4d5a8d9b06c Mon Sep 17 00:00:00 2001 From: BDonnot Date: Fri, 28 Jan 2022 11:57:53 +0100 Subject: [PATCH 15/56] start to 'implement' a PPO based on rllib framework --- .gitignore | 5 +- CHANGELOG.rst | 8 + l2rpn_baselines/PPO_RLLIB/__init__.py | 8 + l2rpn_baselines/PPO_RLLIB/env_rllib.py | 101 +++++++++++ l2rpn_baselines/PPO_RLLIB/train.py | 237 +++++++++++++++++++++++++ l2rpn_baselines/PPO_SB3/__init__.py | 10 +- l2rpn_baselines/PPO_SB3/evaluate.py | 2 +- l2rpn_baselines/PPO_SB3/train.py | 53 ++---- l2rpn_baselines/PPO_SB3/utils.py | 110 ++++++++++++ 9 files changed, 494 insertions(+), 40 deletions(-) create mode 100644 l2rpn_baselines/PPO_RLLIB/__init__.py create mode 100644 l2rpn_baselines/PPO_RLLIB/env_rllib.py create mode 100644 l2rpn_baselines/PPO_RLLIB/train.py diff --git a/.gitignore b/.gitignore index fe9ebcc..2da2283 100644 --- a/.gitignore +++ b/.gitignore @@ -175,4 +175,7 @@ test_eva_dn.py test_import_pposb.py test_make_gym_env.py test_multifolderwithcache.py -l2rpn_baselines/PPO_SB3/saved_model/** \ No newline at end of file +l2rpn_baselines/PPO_SB3/saved_model/** +l2rpn_baselines/PPO_SB3/logs/** +l2rpn_baselines/PPO_RLLIB/logs/** +l2rpn_baselines/PPO_RLLIB/saved_model/** diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3688105..ae1d3f8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,9 +1,17 @@ Change Log =========== + [TODO] -------- +- wirte github issue templates for normal bugs and also for contributions - code a baseline example using ray / rllib - code a baseline example using mazerl +- code a baseline using deepmind acme +- code a baseline 
with a GNN somewhere
+- show an example on how to use some "heuristic" in training / evaluation of trained agents
+- show an example of a model-based RL agent
+- train somewhere a working baseline (that does better than do nothing)
+- refactor the `utils.DeepQAgent` to split the different parts better: starting at different steps, not training for a certain number of steps, sampling hard scenarios etc.
 - stack multiple states in `utils/DeepQAgent`
 
 [0.6.0] - 2022-xx-yy
diff --git a/l2rpn_baselines/PPO_RLLIB/__init__.py b/l2rpn_baselines/PPO_RLLIB/__init__.py
new file mode 100644
index 0000000..c30990a
--- /dev/null
+++ b/l2rpn_baselines/PPO_RLLIB/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2020-2022 RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
diff --git a/l2rpn_baselines/PPO_RLLIB/env_rllib.py b/l2rpn_baselines/PPO_RLLIB/env_rllib.py
new file mode 100644
index 0000000..0f66ad0
--- /dev/null
+++ b/l2rpn_baselines/PPO_RLLIB/env_rllib.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020-2022 RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import gym
+import grid2op
+from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv
+from l2rpn_baselines.PPO_SB3 import remove_non_usable_attr
+
+
+class Env_RLLIB(gym.Env):
+    """
+    This class represents the Environment usable
+    from rllib, mapping a grid2op environment.
+
+    It is primarily made to serve as an example of what is possible to achieve.
+    You will probably want to customize this environment to your specific
+    needs.
+
+    This agent uses the rllib framework to code for
+    a neural network.
+
+    .. warning::
+        A grid2op environment is created when this agent is made. We found
+        out rllib worked better this way.
+
+    To be built, it requires the `env_config` parameter. This parameter is a
+    dictionary with keys:
+
+    - "env_name": the name of the environment you want to make
+    - "obs_attr_to_keep": the attributes of the observation you want to use
+      in the gym observation space (gym observation space is converted
+      to a Box)
+    - "act_attr_to_keep": the attributes of the action you want to use in
+      the gym action space (gym action space is also converted to a
+      Box)
+    - "backend_class": the type of backend to use
+    - "backend_kwargs": the extra keyword arguments used when creating
+      the backend
+    - all other arguments are passed to the `grid2op.make(...)` function
+
+    """
+
+    def __init__(self, env_config):
+        # boilerplate code...
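+        # note: each of the recognized keys ("env_name", "obs_attr_to_keep", "act_attr_to_keep",
+        # "backend_class", "backend_kwargs") is read and then removed from env_config below,
+        # so that every remaining entry can be forwarded untouched to grid2op.make(...)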
+ # retrieve the information + if not "env_name" in env_config: + raise RuntimeError("The configuration for RLLIB should provide the env name") + + nm_env = env_config["env_name"] + del env_config["env_name"] + obs_attr_to_keep = None + if "obs_attr_to_keep" in env_config: + obs_attr_to_keep = env_config["obs_attr_to_keep"] + del env_config["obs_attr_to_keep"] + act_attr_to_keep = None + if "act_attr_to_keep" in env_config: + act_attr_to_keep = env_config["act_attr_to_keep"] + del env_config["act_attr_to_keep"] + if "backend_class" in env_config: + backend_kwargs = {} + if "backend_kwargs" in env_config: + backend_kwargs = env_config["backend_kwargs"] + del env_config["backend_kwargs"] + backend = env_config["backend_class"](**backend_kwargs) + del env_config["backend_class"] + + # 1. create the grid2op environment + self.env_glop = grid2op.make(nm_env, backend=backend, **env_config) + # clean the attribute + act_attr_to_keep = remove_non_usable_attr(self.env_glop, act_attr_to_keep) + + # 2. create the gym environment + self.env_gym = GymEnv(self.env_glop) + + # 3. customize action space + if obs_attr_to_keep is not None: + self.env_gym.observation_space.close() + self.env_gym.observation_space = BoxGymObsSpace(self.env_glop.observation_space, + attr_to_keep=obs_attr_to_keep) + + if act_attr_to_keep is not None: + self.env_gym.action_space.close() + self.env_gym.action_space = BoxGymActSpace(self.env_glop.action_space, + attr_to_keep=act_attr_to_keep) + + # 4. specific to rllib + self.action_space = self.env_gym.action_space + self.observation_space = self.env_gym.observation_space + + def reset(self): + obs = self.env_gym.reset() + return obs + + def step(self, action): + obs, reward, done, info = self.env_gym.step(action) + return obs, reward, done, info diff --git a/l2rpn_baselines/PPO_RLLIB/train.py b/l2rpn_baselines/PPO_RLLIB/train.py new file mode 100644 index 0000000..c00c141 --- /dev/null +++ b/l2rpn_baselines/PPO_RLLIB/train.py @@ -0,0 +1,237 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import grid2op +import copy +from l2rpn_baselines.PPO_RLLIB.env_rllib import Env_RLLIB +from l2rpn_baselines.PPO_SB3 import (default_obs_attr_to_keep, + default_act_attr_to_keep, + save_used_attribute + ) + +import ray +from ray.rllib.agents import ppo +from ray.tune.logger import pretty_print + +def train(env, + name="ppo_rllib", + iterations=1, + save_path=None, + load_path=None, # TODO + net_arch=None, + learning_rate=3e-4, + verbose=False, + save_every_xxx_steps=None, + obs_attr_to_keep=copy.deepcopy(default_obs_attr_to_keep), + act_attr_to_keep=copy.deepcopy(default_act_attr_to_keep), + env_kwargs=None, + **kwargs): + """ + This function will use the rllib to train a PPO agent on + a grid2op environment "env". + + It will use the grid2op "gym_compat" module to convert the action space + to a BoxActionSpace and the observation to a BoxObservationSpace. + + It is suited for the studying the impact of continuous actions: + + - on storage units + - on dispatchable generators + - on generators with renewable energy sources + + .. 
warning:: + The environment used by RLLIB is copied and remade. This class does + not work if you over specialize the environment ! + For example, opponent is not taken into account (yet), nor the chronics class + etc. + + If you want such level of control, please use the `env_kwargs` parameters ! + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + Only the name of the environment, and its backend is used. The rest will + be created by rllib. + + name: ``str``` + The name of your agent. + + iterations: ``int`` + For how many iterations do you want to train the model. + These are **NOT** steps, but ray internal number of iterations. + For some experiments we performed, + + save_path: ``str`` + Where do you want to save your baseline. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. + + net_arch: + The neural network architecture, used to create the neural network + of the PPO (see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html) + + learning_rate: ``float`` + The learning rate, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html + + save_every_xxx_steps: ``int`` + If set (by default it's None) the stable baselines3 model will be saved + to the hard drive each `save_every_xxx_steps` steps performed in the + environment. + + obs_attr_to_keep: list of string + Grid2op attribute to use to build the BoxObservationSpace. It is passed + as the "attr_to_keep" value of the + BoxObservation space (see + https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymObsSpace) + + act_attr_to_keep: list of string + Grid2op attribute to use to build the BoxGymActSpace. It is passed + as the "attr_to_keep" value of the + BoxAction space (see + https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymActSpace) + + verbose: ``bool`` + If you want something to be printed on the terminal (a better logging strategy will be put at some point) + + env_kwargs: Optional[dict] + Extra key word arguments passed to the building of the + grid2op environment. + + kwargs: + extra parameters passed to the trainer from rllib + + Returns + ------- + + baseline: + The trained baseline as a stable baselines PPO element. + + + .. _Example-ppo_stable_baseline: + + Examples + --------- + + Here is an example on how to train a ppo_stablebaseline . + + First define a python script, for example + + .. code-block:: python + + import re + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended for training ! 
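+        # note: only the environment name and the backend class of `env` are reused by the
+        # rllib workers (see the warning above); any other customization goes through `env_kwargs`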
+ + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + backend=LightSimBackend()) + + try: + train(env, + iterations=10, # any number of iterations you want + save_path="./saved_model", # where the NN weights will be saved + name="test", # name of the baseline + net_arch=[100, 100, 100], # architecture of the NN + save_every_xxx_steps=2, # save the NN every 2 training steps + env_kwargs={"reward_class": LinesCapacityReward, + "chronics_class": MultifolderWithCache, # highly recommended + "data_feeding_kwargs": { + 'filter_func': lambda x: re.match(".*00$", x) is not None #use one over 100 chronics to train (for speed) + } + }, + verbose=True + ) + finally: + env.close() + + """ + ray.init() + + + if save_path is not None: + if not os.path.exists(save_path): + os.mkdir(save_path) + + path_expe = os.path.join(save_path, name) + if not os.path.exists(path_expe): + os.mkdir(path_expe) + + # save the attributes kept + need_saving = save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) + + if env_kwargs is None: + env_kwargs = {} + + env_params = env.get_kwargs() + env_config = {"env_name": env.env_name, + "backend_class": env_params["_raw_backend_class"], + "obs_attr_to_keep": default_obs_attr_to_keep, + "act_attr_to_keep": default_act_attr_to_keep, + **env_kwargs} + + # then define a "trainer" + trainer = ppo.PPOTrainer(env=Env_RLLIB, config={ + # config to pass to env class + "env_config": env_config, + #neural network config + "lr": learning_rate, + "model": { + "fcnet_hiddens": net_arch, + }, + **kwargs + }) + for step in range(iterations): + # Perform one iteration of training the policy with PPO + result = trainer.train() + if verbose: + print(pretty_print(result)) + + if need_saving and step % save_every_xxx_steps == 0: + checkpoint = trainer.save(checkpoint_dir=path_expe) + + checkpoint = trainer.save(checkpoint_dir=path_expe) + ray.shutdown() + return trainer + + +if __name__ == "__main__": + import re + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended for training ! 
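+    # same example as the one given in the docstring above, kept executable so the module
+    # can be run directly for a quick (and very short) training session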
+ + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + backend=LightSimBackend()) + + try: + train(env, + iterations=10, # any number of iterations you want + save_path="./saved_model", # where the NN weights will be saved + name="test", # name of the baseline + net_arch=[100, 100, 100], # architecture of the NN + save_every_xxx_steps=2, # save the NN every 2 training steps + env_kwargs={"reward_class": LinesCapacityReward, + "chronics_class": MultifolderWithCache, # highly recommended + "data_feeding_kwargs": { + 'filter_func': lambda x: re.match(".*00$", x) is not None #use one over 100 chronics to train (for speed) + } + }, + verbose=True + ) + finally: + env.close() + diff --git a/l2rpn_baselines/PPO_SB3/__init__.py b/l2rpn_baselines/PPO_SB3/__init__.py index aef1732..6d3d55d 100644 --- a/l2rpn_baselines/PPO_SB3/__init__.py +++ b/l2rpn_baselines/PPO_SB3/__init__.py @@ -9,9 +9,17 @@ __all__ = [ "evaluate", "train", - "PPO_SB3" + "PPO_SB3", + "default_act_attr_to_keep", + "default_obs_attr_to_keep", + "remove_non_usable_attr", + "save_used_attribute" ] from l2rpn_baselines.PPO_SB3.utils import SB3Agent as PPO_SB3 +from l2rpn_baselines.PPO_SB3.utils import (default_act_attr_to_keep, + default_obs_attr_to_keep, + remove_non_usable_attr, + save_used_attribute) from l2rpn_baselines.PPO_SB3.evaluate import evaluate from l2rpn_baselines.PPO_SB3.train import train diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index eb440e0..85582a0 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -94,7 +94,7 @@ def evaluate(env, from grid2op.Reward import LinesCapacityReward # or any other rewards from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended ! 
- from l2rpn_baselines.ppo_stablebaselines import evaluate + from l2rpn_baselines.PPO_SB3 import evaluate nb_episode = 7 nb_process = 1 diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index 9c7bc2d..78af502 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -29,16 +29,14 @@ class MlpPolicy(object): It represents `from stable_baselines3.ppo import MlpPolicy` """ -_default_obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", - "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", - "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", - "storage_power", "storage_charge"] - -_default_act_attr_to_keep = ["redispatch", "curtail", "set_storage"] +from l2rpn_baselines.PPO_SB3.utils import (default_obs_attr_to_keep, + default_act_attr_to_keep, + remove_non_usable_attr, + save_used_attribute) def train(env, - name="ppo_stable_baselines", + name="PPO_SB3", iterations=1, save_path=None, load_path=None, @@ -47,8 +45,8 @@ def train(env, learning_rate=3e-4, save_every_xxx_steps=None, model_policy=MlpPolicy, - obs_attr_to_keep=copy.deepcopy(_default_obs_attr_to_keep), - act_attr_to_keep=copy.deepcopy(_default_act_attr_to_keep), + obs_attr_to_keep=copy.deepcopy(default_obs_attr_to_keep), + act_attr_to_keep=copy.deepcopy(default_act_attr_to_keep), **kwargs): """ This function will use stable baselines 3 to train a PPO agent on @@ -141,7 +139,7 @@ def train(env, from grid2op.Reward import LinesCapacityReward # or any other rewards from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended for training ! - from l2rpn_baselines.ppo_stablebaselines import train + from l2rpn_baselines.PPO_SB3 import train env_name = "l2rpn_case14_sandbox" env = grid2op.make(env_name, @@ -169,32 +167,14 @@ def train(env, """ if not _CAN_USE_STABLE_BASELINE: raise ImportError("Cannot use this function as stable baselines3 is not installed") - if act_attr_to_keep == _default_act_attr_to_keep: - # by default, i remove all the attributes that are not supported by the action type - # i do not do that if the user specified specific attributes to keep. This is his responsibility in - # in this case - modif_attr = [] - for el in act_attr_to_keep: - if env.action_space.supports_type(el): - modif_attr.append(el) - else: - warnings.warn(f"attribute {el} cannot be processed by the allowed " - "action type. 
It has been removed from the " - "gym space as well.") - act_attr_to_keep = modif_attr - + + # keep only usable attributes (if default is used) + act_attr_to_keep = remove_non_usable_attr(env, act_attr_to_keep) + + # save the attributes kept if save_path is not None: - # save the attributes kept my_path = os.path.join(save_path, name) - if not os.path.exists(save_path): - os.mkdir(save_path) - if not os.path.exists(my_path): - os.mkdir(my_path) - - with open(os.path.join(my_path, "obs_attr_to_keep.json"), encoding="utf-8", mode="w") as f: - json.dump(fp=f, obj=obs_attr_to_keep) - with open(os.path.join(my_path, "act_attr_to_keep.json"), encoding="utf-8", mode="w") as f: - json.dump(fp=f, obj=act_attr_to_keep) + save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) # define the gym environment from the grid2op env env_gym = GymEnv(env) @@ -264,12 +244,11 @@ def train(env, env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! - train(env, - iterations=10_000_000, + iterations=1_000, logs_dir="./logs", save_path="./saved_model", - name="test2", + name="test3", net_arch=[200, 200, 200], save_every_xxx_steps=2000, ) diff --git a/l2rpn_baselines/PPO_SB3/utils.py b/l2rpn_baselines/PPO_SB3/utils.py index 250c090..72481b2 100644 --- a/l2rpn_baselines/PPO_SB3/utils.py +++ b/l2rpn_baselines/PPO_SB3/utils.py @@ -6,6 +6,10 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +import warnings +import os +import json +from typing import List, Optional from l2rpn_baselines.utils import GymAgent @@ -19,9 +23,115 @@ class PPO(object): It represents `from stable_baselines3 import PPO` """ + + +default_obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + + +default_act_attr_to_keep = ["redispatch", "curtail", "set_storage"] +def remove_non_usable_attr(grid2openv, act_attr_to_keep: List[str]) -> List[str]: + """This function modifies the attribute (of the actions) + to remove the one that are non usable with your gym environment. + + If only filters things if the default variables are used + (see _default_act_attr_to_keep) + + Parameters + ---------- + grid2openv : grid2op.Environment.Environment + The used grid2op environment + act_attr_to_keep : List[str] + The attributes of the actions to keep. + + Returns + ------- + List[str] + The same as `act_attr_to_keep` if the user modified the default. + Or the attributes usable by the environment from the default list. + + """ + modif_attr = act_attr_to_keep + if act_attr_to_keep == default_act_attr_to_keep: + # by default, i remove all the attributes that are not supported by the action type + # i do not do that if the user specified specific attributes to keep. This is his responsibility in + # in this case + modif_attr = [] + for el in act_attr_to_keep: + if grid2openv.action_space.supports_type(el): + modif_attr.append(el) + else: + warnings.warn(f"attribute {el} cannot be processed by the allowed " + "action type. 
It has been removed from the " + "gym space as well.") + return modif_attr + + +def save_used_attribute(save_path: Optional[str], + name: str, + obs_attr_to_keep: List[str], + act_attr_to_keep: List[str]) -> bool: + """Serialize, as jon the obs_attr_to_keep and act_attr_to_keep + + This is typically called in the `train` function. + + Parameters + ---------- + save_path : Optional[str] + where to save the used attributes (put ``None`` if you don't want to + save it) + name : str + Name of the model + obs_attr_to_keep : List[str] + List of observation attributes to keep + act_attr_to_keep : List[str] + List of action attributes to keep + + Returns + ------- + bool + whether the data have been saved or not + """ + res = False + if save_path is not None: + my_path = os.path.join(save_path, name) + if not os.path.exists(save_path): + os.mkdir(save_path) + if not os.path.exists(my_path): + os.mkdir(my_path) + + with open(os.path.join(my_path, "obs_attr_to_keep.json"), encoding="utf-8", mode="w") as f: + json.dump(fp=f, obj=obs_attr_to_keep) + with open(os.path.join(my_path, "act_attr_to_keep.json"), encoding="utf-8", mode="w") as f: + json.dump(fp=f, obj=act_attr_to_keep) + res = True + return res + + class SB3Agent(GymAgent): + """This class represents the Agent (directly usable with grid2op framework) + + This agents uses the stable-baselines3 `nn_type` (by default PPO) as + the neural network to take decisions on the grid. + + To be built, it requires: + + - `g2op_action_space`: a grid2op action space (used for initializing the grid2op agent) + - `gym_act_space`: a gym observation space (used for the neural networks) + - `gym_obs_space`: a gym action space (used for the neural networks) + + It can also accept different types of parameters: + + - `nn_type`: the type of "neural network" from stable baselines (by default PPO) + - `nn_path`: the path where the neural network can be loaded from + - `nn_kwargs`: the parameters used to build the neural network from scratch. + + Exactly one of `nn_path` and `nn_kwargs` should be provided. No more, no less. 
+ """ def __init__(self, g2op_action_space, gym_act_space, From eb72533d3d2309cef8e0e0fb44fc23c434548365 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Fri, 28 Jan 2022 16:18:32 +0100 Subject: [PATCH 16/56] can do the train -> eval for PPO_RLLIB, need to improve the train, do the docs and tests --- .gitignore | 1 + CHANGELOG.rst | 2 + l2rpn_baselines/PPO_RLLIB/evaluate.py | 243 ++++++++++++++++++++++++ l2rpn_baselines/PPO_RLLIB/rllibagent.py | 138 ++++++++++++++ l2rpn_baselines/PPO_RLLIB/train.py | 48 +++-- l2rpn_baselines/PPO_SB3/evaluate.py | 93 ++------- l2rpn_baselines/utils/gymAgent.py | 8 +- setup.py | 18 +- 8 files changed, 445 insertions(+), 106 deletions(-) create mode 100644 l2rpn_baselines/PPO_RLLIB/evaluate.py create mode 100644 l2rpn_baselines/PPO_RLLIB/rllibagent.py diff --git a/.gitignore b/.gitignore index 2da2283..a6e0dd5 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,4 @@ l2rpn_baselines/PPO_SB3/saved_model/** l2rpn_baselines/PPO_SB3/logs/** l2rpn_baselines/PPO_RLLIB/logs/** l2rpn_baselines/PPO_RLLIB/saved_model/** +test_jsonpickle.json diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ae1d3f8..b5727b1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Change Log [TODO] -------- - wirte github issue templates for normal bugs and also for contributions +- in the "examples" folder, make some examples for possible "submissions" + usable in the competition for PPO_SB3 and PPO_RLLIB - code a baseline example using ray / rllib - code a baseline example using mazerl - code a baseline using deepmind acme diff --git a/l2rpn_baselines/PPO_RLLIB/evaluate.py b/l2rpn_baselines/PPO_RLLIB/evaluate.py new file mode 100644 index 0000000..58a021f --- /dev/null +++ b/l2rpn_baselines/PPO_RLLIB/evaluate.py @@ -0,0 +1,243 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import json +from grid2op.Runner import Runner + +from l2rpn_baselines.utils.save_log_gif import save_log_gif + +from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace + +from l2rpn_baselines.PPO_RLLIB.rllibagent import RLLIBAgent + +def evaluate(env, + load_path=".", + name="ppo_rllib", + logs_path=None, + nb_episode=1, + nb_process=1, + max_steps=-1, + verbose=False, + save_gif=False, + **kwargs): + """ + This function will use rllib package to evalute a previously trained + PPO agent (with rllib) on a grid2op environment "env". + + It will use the grid2op "gym_compat" module to convert the action space + to a BoxActionSpace and the observation to a BoxObservationSpace. + + It is suited for the studying the impact of continuous actions: + + - on storage units + - on dispatchable generators + - on generators with renewable energy sources + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + name: ``str``` + The name of your agent. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. 
+ + logs_dir: ``str`` + Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. + + nb_episode: ``str`` + How many episodes to run during the assessment of the performances + + nb_process: ``int`` + On how many process the assessment will be made. (setting this > 1 can lead to some speed ups but can be + unstable on some plaform) + + max_steps: ``int`` + How many steps at maximum your agent will be assessed + + verbose: ``bool`` + Currently un used + + save_gif: ``bool`` + Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might + take a lot of ram) and drastically increase computation time. + + kwargs: + extra parameters passed to the PPO from stable baselines 3 + + Returns + ------- + + baseline: + The loaded baseline as a stable baselines PPO element. + + Examples + --------- + + Here is an example on how to train a ppo_stablebaseline . + + First define a python script, for example + + .. code-block:: python + + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + from l2rpn_baselines.PPO_RLLIB import evaluate + + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + evaluate(env, + nb_episode=nb_episode, + load_path="./saved_model", # should be the same as what has been called in the train function ! + name="test", # should be the same as what has been called in the train function ! + nb_process=1, + verbose=verbose, + ) + + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() + + """ + import jsonpickle # lazy loading to save import time + + # load the attributes kept + my_path = os.path.join(load_path, name) + if not os.path.exists(load_path): + os.mkdir(load_path) + if not os.path.exists(my_path): + os.mkdir(my_path) + + with open(os.path.join(my_path, "obs_attr_to_keep.json"), encoding="utf-8", mode="r") as f: + obs_attr_to_keep = json.load(fp=f) + with open(os.path.join(my_path, "act_attr_to_keep.json"), encoding="utf-8", mode="r") as f: + act_attr_to_keep = json.load(fp=f) + + # create the action and observation space + gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) + gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + # retrieve the env config (for rllib) + with open(os.path.join(my_path, "env_config.json"), "r", encoding="utf-8") as f: + str_ = f.read() + env_config_ppo = jsonpickle.decode(str_) + + # create a grid2gop agent based on that (this will reload the save weights) + full_path = os.path.join(load_path, name) + grid2op_agent = RLLIBAgent(env.action_space, + gym_action_space, + gym_observation_space, + nn_config=env_config_ppo, + nn_path=os.path.join(full_path)) + + # Build runner + runner_params = 
env.get_params_for_runner() + runner_params["verbose"] = verbose + runner = Runner(**runner_params, + agentClass=None, + agentInstance=grid2op_agent) + + # Run the agent on the scenarios + if logs_path is not None: + os.makedirs(logs_path, exist_ok=True) + + res = runner.run(path_save=logs_path, + nb_episode=nb_episode, + nb_process=nb_process, + max_iter=max_steps, + pbar=verbose, + **kwargs) + + # Print summary + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + + if save_gif: + if verbose: + print("Saving the gif of the episodes") + save_log_gif(logs_path, res) + return grid2op_agent, res + +if __name__ == "__main__": + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + evaluate(env, + nb_episode=nb_episode, + load_path="./saved_model", # should be the same as what has been called in the train function ! + name="test", # should be the same as what has been called in the train function ! + nb_process=1, + verbose=verbose, + ) + + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() \ No newline at end of file diff --git a/l2rpn_baselines/PPO_RLLIB/rllibagent.py b/l2rpn_baselines/PPO_RLLIB/rllibagent.py new file mode 100644 index 0000000..4de4a0d --- /dev/null +++ b/l2rpn_baselines/PPO_RLLIB/rllibagent.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import copy +import os +import re +from typing import List, Optional + +from l2rpn_baselines.utils import GymAgent +from l2rpn_baselines.PPO_RLLIB.env_rllib import Env_RLLIB + +try: + from ray.rllib.agents.ppo import PPOTrainer + _CAN_USE_RLLIB = True +except ImportError: + _CAN_USE_RLLIB = False + + class PPOTrainer(object): + """ + Do not use, this class is a template when rllib is not installed. + + It represents `from ray.rllib.agents.ppo import PPOTrainer` + """ + + +class RLLIBAgent(GymAgent): + """This class represents the Agent (directly usable with grid2op framework) + + This agents uses the stable baseline `nn_type` (by default PPOTrainer) as + the neural network to take decisions on the grid. 
+ + To be built, it requires: + + - `g2op_action_space`: a grid2op action space (used for initializing the grid2op agent) + - `gym_act_space`: a gym observation space (used for the neural networks) + - `gym_obs_space`: a gym action space (used for the neural networks) + - `nn_config`: the parameters used to build the rllib "trainer" (the thing + tagged "nn_config" in rllib) + + It can also accept different types of parameters: + + - `nn_type`: the type of "neural network" from rllib (by default PPOTrainer) + - `nn_path`: the path where the neural network can be loaded from + + For this class `nn_config` is mandatory. The trainer is built with: + + .. code-block:: python + + from l2rpn_baselines.PPO_RLLIB import Env_RLLIB + PPOTrainer(env=Env_RLLIB, config=nn_config) + + """ + def __init__(self, + g2op_action_space, + gym_act_space, + gym_obs_space, + nn_config, + nn_type=PPOTrainer, + nn_path=None, + ): + if not _CAN_USE_RLLIB: + raise ImportError("Cannot import ray[rllib]. Impossible to use this class.") + + self._nn_type = nn_type + if nn_config is None: + raise RuntimeError("For RLLIB agent you need to provide nn_kwargs") + self._nn_config = nn_config + + nn_kwargs = {"env": Env_RLLIB, + "config": nn_config + } + super().__init__(g2op_action_space, gym_act_space, gym_obs_space, + nn_path=nn_path, nn_kwargs=nn_kwargs, + _check_both_set=False,) + + def get_act(self, gym_obs, reward, done): + """Retrieve the gym action from the gym observation and the reward. + It only (for now) work for non recurrent policy. + + Parameters + ---------- + gym_obs : gym observation + The gym observation + reward : ``float`` + the current reward + done : ``bool`` + whether the episode is over or not. + + Returns + ------- + gym action + The gym action, that is processed in the :func:`GymAgent.act` + to be used with grid2op + """ + action = self.nn_model.compute_single_action(gym_obs) + return action + + def load(self): + """ + Load the NN model. + + In the case of a PPO agent, this is equivalent to perform the: + + .. code-block:: python + + PPOTrainer.restore(nn_path) + + """ + self.build() + chkts = sorted(os.listdir(self._nn_path)) + last_chkts = [re.match("checkpoint_[0-9]+$", el) is not None + for el in chkts] + last_chkts = [el for el, ok_ in zip(chkts, last_chkts) if ok_] + last_chkt_path = last_chkts[-1] + last_chkt_path = os.path.join(self._nn_path, last_chkt_path) + possible_chkt = [el for el in os.listdir(last_chkt_path) + if re.match(".*.tune_metadata$", el) is not None] + assert len(possible_chkt) + last_chkt = possible_chkt[-1] + last_chkt = re.sub(r"\.tune_metadata$", "", last_chkt) + self.nn_model.restore(checkpoint_path=os.path.join(last_chkt_path, last_chkt)) + + def build(self): + """Create the underlying NN model from scratch. + + In the case of a PPO agent, this is equivalent to perform the: + + .. 
code-block:: python + + PPOTrainer(env= Env_RLLIB, config=nn_config) + + """ + self.nn_model = PPOTrainer(**self._nn_kwargs) diff --git a/l2rpn_baselines/PPO_RLLIB/train.py b/l2rpn_baselines/PPO_RLLIB/train.py index c00c141..268db99 100644 --- a/l2rpn_baselines/PPO_RLLIB/train.py +++ b/l2rpn_baselines/PPO_RLLIB/train.py @@ -12,12 +12,17 @@ from l2rpn_baselines.PPO_RLLIB.env_rllib import Env_RLLIB from l2rpn_baselines.PPO_SB3 import (default_obs_attr_to_keep, default_act_attr_to_keep, - save_used_attribute + save_used_attribute, + remove_non_usable_attr ) -import ray -from ray.rllib.agents import ppo -from ray.tune.logger import pretty_print +try: + import ray + from ray.rllib.agents import ppo + from ray.tune.logger import pretty_print + _CAN_USE_RLLIB = True +except ImportError as exc_: + _CAN_USE_RLLIB = False def train(env, name="ppo_rllib", @@ -130,6 +135,7 @@ def train(env, import re import grid2op + import ray from grid2op.Reward import LinesCapacityReward # or any other rewards from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended for training ! @@ -137,7 +143,8 @@ def train(env, env_name = "l2rpn_case14_sandbox" env = grid2op.make(env_name, backend=LightSimBackend()) - + + ray.init() # if needed (you might have it already working somewhere) try: train(env, iterations=10, # any number of iterations you want @@ -155,10 +162,12 @@ def train(env, ) finally: env.close() + ray.shutdown() # if needed (you might have it already working somewhere) """ - ray.init() - + import jsonpickle + if not _CAN_USE_RLLIB: + raise ImportError("RLLIB is not installed on your machine") if save_path is not None: if not os.path.exists(save_path): @@ -169,6 +178,7 @@ def train(env, os.mkdir(path_expe) # save the attributes kept + act_attr_to_keep = remove_non_usable_attr(env, act_attr_to_keep) need_saving = save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) if env_kwargs is None: @@ -181,8 +191,7 @@ def train(env, "act_attr_to_keep": default_act_attr_to_keep, **env_kwargs} - # then define a "trainer" - trainer = ppo.PPOTrainer(env=Env_RLLIB, config={ + env_config_ppo = { # config to pass to env class "env_config": env_config, #neural network config @@ -191,7 +200,16 @@ def train(env, "fcnet_hiddens": net_arch, }, **kwargs - }) + } + + # store it + encoded = jsonpickle.encode(env_config_ppo) + with open(os.path.join(path_expe, "env_config.json"), "w", encoding="utf-8") as f: + f.write(encoded) + + # then define a "trainer" + # TODO what if we want to restore it ! + trainer = ppo.PPOTrainer(env=Env_RLLIB, config=env_config_ppo) for step in range(iterations): # Perform one iteration of training the policy with PPO result = trainer.train() @@ -202,7 +220,6 @@ def train(env, checkpoint = trainer.save(checkpoint_dir=path_expe) checkpoint = trainer.save(checkpoint_dir=path_expe) - ray.shutdown() return trainer @@ -212,11 +229,14 @@ def train(env, from grid2op.Reward import LinesCapacityReward # or any other rewards from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended for training ! 
- + import ray + + env_name = "l2rpn_case14_sandbox" env = grid2op.make(env_name, backend=LightSimBackend()) - + + ray.init() try: train(env, iterations=10, # any number of iterations you want @@ -234,4 +254,4 @@ def train(env, ) finally: env.close() - + ray.shutdown() diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index 85582a0..32d4d03 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -28,7 +28,8 @@ def evaluate(env, save_gif=False, **kwargs): """ - This function will use stable baselines 3 to train a PPO agent on + This function will use stable baselines 3 to evaluate a previously trained + PPO agent (with stable baselines 3) on a grid2op environment "env". It will use the grid2op "gym_compat" module to convert the action space @@ -92,7 +93,6 @@ def evaluate(env, import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards - from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended ! from l2rpn_baselines.PPO_SB3 import evaluate @@ -155,7 +155,9 @@ def evaluate(env, # create a grid2gop agent based on that (this will reload the save weights) full_path = os.path.join(load_path, name) - grid2op_agent = SB3Agent(env.action_space, gym_action_space, gym_observation_space, + grid2op_agent = SB3Agent(env.action_space, + gym_action_space, + gym_observation_space, nn_path=os.path.join(full_path, name)) # Build runner @@ -196,9 +198,7 @@ def evaluate(env, import grid2op from grid2op.Reward import LinesCapacityReward # or any other rewards - from grid2op.Chronics import MultifolderWithCache # highly recommended from lightsim2grid import LightSimBackend # highly recommended ! - # from l2rpn_baselines.ppo_stablebaselines import evaluate nb_episode = 7 nb_process = 1 @@ -212,12 +212,12 @@ def evaluate(env, try: evaluate(env, - nb_episode=nb_episode, - load_path="./saved_model", - name="test", - nb_process=1, - verbose=verbose, - ) + nb_episode=nb_episode, + load_path="./saved_model", + name="test", + nb_process=1, + verbose=verbose, + ) # you can also compare your agent with the do nothing agent relatively # easily @@ -238,74 +238,3 @@ def evaluate(env, print(msg_tmp) finally: env.close() - - # import re - # from grid2op.Reward import LinesCapacityReward # or any other rewards - # from lightsim2grid import LightSimBackend # highly recommended ! - # from grid2op.Chronics import MultifolderWithCache # highly recommended - - # env_name = "l2rpn_case14_sandbox" - # env = grid2op.make(env_name, - # reward_class=LinesCapacityReward, - # backend=LightSimBackend(), - # chronics_class=MultifolderWithCache) - - # env.chronics_handler.real_data.set_filter(lambda x: re.match(".*00$", x) is not None) - # env.chronics_handler.real_data.reset() - # # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline - # # for more information ! 
- - # try: - # train(env, - # iterations=10_000, # any number of iterations you want - # logs_dir="./logs", # where the tensorboard logs will be put - # save_path="./saved_model", # where the NN weights will be saved - # name="test", # name of the baseline - # net_arch=[100, 100, 100], # architecture of the NN - # save_every_xxx_steps=2000, # save the NN every 2k steps - # ) - # finally: - # env.close() - - # import grid2op - # from grid2op.Action import CompleteAction - # from grid2op.Reward import L2RPNReward, EpisodeDurationReward, LinesCapacityReward - # from grid2op.gym_compat import GymEnv, DiscreteActSpace, BoxGymObsSpace - # from lightsim2grid import LightSimBackend - # from grid2op.Chronics import MultifolderWithCache - # import pdb - - # nb_episode = 7 - # nb_process = 1 - # verbose = True - - # env = grid2op.make("educ_case14_storage", - # test=True, - # action_class=CompleteAction, - # reward_class=LinesCapacityReward, - # backend=LightSimBackend()) - - # evaluate(env, - # nb_episode=nb_episode, - # load_path="./saved_model", - # name="test4", - # nb_process=1, - # verbose=verbose, - # ) - - # # to compare with do nothing - # runner_params = env.get_params_for_runner() - # runner = Runner(**runner_params) - - # res = runner.run(nb_episode=nb_episode, - # nb_process=nb_process - # ) - - # # Print summary - # if verbose: - # print("Evaluation summary for DN:") - # for _, chron_name, cum_reward, nb_time_step, max_ts in res: - # msg_tmp = "chronics at: {}".format(chron_name) - # msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) - # msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - # print(msg_tmp) diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 3ff8cd8..fb4e01a 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -40,15 +40,17 @@ def __init__(self, gym_obs_space, *, # to prevent positional argument nn_path=None, - nn_kwargs=None): + nn_kwargs=None, + _check_both_set=True, + _check_none_set=True): super().__init__(g2op_action_space) self._gym_act_space = gym_act_space self._gym_obs_space = gym_obs_space - if nn_path is None and nn_kwargs is None: + if _check_none_set and (nn_path is None and nn_kwargs is None): raise RuntimeError("Impossible to build a GymAgent without providing at " "least one of `nn_path` (to load the agent from disk) " "or `nn_kwargs` (to create the underlying agent).") - if nn_path is not None and nn_kwargs is not None: + if _check_both_set and (nn_path is not None and nn_kwargs is not None): raise RuntimeError("Impossible to build a GymAgent by providing both " "`nn_path` (*ie* you want load the agent from disk) " "and `nn_kwargs` (*ie* you want to create the underlying agent from these " diff --git a/setup.py b/setup.py index 1d0d153..ae76aa0 100644 --- a/setup.py +++ b/setup.py @@ -34,26 +34,30 @@ "torch>=1.4.0", "scikit-learn>=0.22.2", "gym>=0.17.1" - ] + ], + "rllib": ["ray[rllib]", + "jsonpickle", + "lightsim2grid"], + "stable_baselines": ["stable_baselines3"] } } +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + setup(name='l2rpn_baselines', version=__version__, description='L2RPN Baselines a repository to host ' \ 'baselines for l2rpn competitions.', - long_description='This repository aims at facilitating ' \ - 'the use of state of the art algorithm in coming from the ' \ - 'reinforcement learning community or the power system ' \ - 'community in the l2rpn competitions. 
It also provides ' \ - 'some usefull function to make life or participants to the ' \ - 'l2rpn competitions easier.', + long_description=long_description, + long_description_content_type="text/markdown", classifiers=[ 'Development Status :: 4 - Beta', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Intended Audience :: Developers", "Intended Audience :: Education", From 608b7f50b1c603e8272fc8d6157b4767ee020e29 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 10:53:53 +0100 Subject: [PATCH 17/56] adding some docs and better api for ppo rllib --- docs/index.rst | 77 +++++---- docs/ppo_rllib.rst | 199 ++++++++++++++++++++++++ docs/ppo_stable_baselines.rst | 4 +- l2rpn_baselines/DuelQLeapNet/train.py | 1 + l2rpn_baselines/DuelQSimple/train.py | 3 +- l2rpn_baselines/LeapNetEncoded/train.py | 3 +- l2rpn_baselines/PPO_RLLIB/evaluate.py | 8 +- l2rpn_baselines/PPO_RLLIB/rllibagent.py | 147 +++++++++++++++++ l2rpn_baselines/PPO_RLLIB/train.py | 66 +++++--- l2rpn_baselines/PPO_SB3/evaluate.py | 5 +- l2rpn_baselines/PPO_SB3/train.py | 29 ++-- l2rpn_baselines/PPO_SB3/utils.py | 120 ++++++++++++++ 12 files changed, 585 insertions(+), 77 deletions(-) create mode 100644 docs/ppo_rllib.rst diff --git a/docs/index.rst b/docs/index.rst index 57f036f..c945d15 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,9 +10,6 @@ Welcome to l2rpn-baselines's documentation! In this documentation we expose first what is this package about and how to contribute, and then which baselines are already implemented in this package. -How to contribute ------------------- - .. toctree:: :maxdepth: 2 :caption: How to contribute @@ -20,55 +17,79 @@ How to contribute template donothing -Baseline already Available ---------------------------- +Open source libraries +---------------------- -These are the "baselines" that are available. Please note that each of these baselines -is provided as an example of what can be achieved with grid2op. +Lots of reinforcement learning algorithms are already implemented by state of +the art libraries heavily maintained and updated. -It can serve a possible implementation for a usecase. At the moment, we do not provide -baseline with hyper parameters tuned that performs correctly. +We highly recommend to use such packages if you would like to apply reinforcement +learning to the power grid control problem. .. toctree:: - :maxdepth: 2 - :caption: Reference baselines + :maxdepth: 1 + :caption: Open source libraries - utils - deepqsimple - doubleduelingdqn - duelqsimple - expertagent ppo_stable_baselines + ppo_rllib +Other contributions +--------------------- -More advanced baselines ------------------------- +In this section, we grouped up some noticeable contributions for the powergrid control +problem. + +These solutions comes either from past top performers of the l2rpn competitions, or +from custom implementation of some published research performing well +in some environment. .. toctree:: - :maxdepth: 2 - :caption: More advanced baselines and Contributions + :maxdepth: 1 + :caption: Open source libraries + + expertagent + external_contributions + +Custom implementation +--------------------------- + +.. note:: + WORK IN PROGRESS +For more "in depth" look at what is possible to do, we also wrote some +custom implementation of some reinforcement learning algorithms. 
+ +We do not necessarily recommend to have a deep look at these packages. However, +you can check them out if you need some inspiration of what can be done by +using grid2op more closely that through the gym interface. + +.. toctree:: + :maxdepth: 1 + :caption: Reference baselines + + utils + deepqsimple + doubleduelingdqn + duelqsimple duelqleapnet doubleduelingrdqn leapnetencoded - external_contributions Deprecated baselines --------------------------- +.. warning:: + These are "deprecated", won't be fixed / maintained and are not likely to work. + +This section is mainly here for "history". + .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Deprecated baselines sacold - -Contributions -------------- - -TODO - Indices and tables ================== diff --git a/docs/ppo_rllib.rst b/docs/ppo_rllib.rst new file mode 100644 index 0000000..725f720 --- /dev/null +++ b/docs/ppo_rllib.rst @@ -0,0 +1,199 @@ +.. currentmodule:: l2rpn_baselines.ppo_stablebaselines + +PPO: with ray/rllib +=========================================================== + +Description +----------- +This "baseline" aims at providing a code example on how to use an agent +from the ray/rllib repository (see https://docs.ray.io/en/master/rllib/) +with grid2op. + +It also serve a second goal, to show how to train a PPO agent to perform +continuous actions on the powergrid (*eg* adjusting the generator value, either +by applying `redispatching` kind of action for controlable generators or +by with `curtailment` on generator using new renewable energy sources - solar and wind +or even to control the state of the storage units.) + +It is pretty much the same as the :class:`l2rpn_baselines.PPO_SB3` but uses +rllib instead of stable Baselines3. + +Exported class +-------------- +You can use this class with: + +.. code-block:: python + + from l2rpn_baselines.PPO_RLLIB import train, evaluate, PPO_RLLIB + +Used a trained agent +++++++++++++++++++++++ + +You first need to train it: + +.. code-block:: python + + import re + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from grid2op.Chronics import MultifolderWithCache # highly recommended + from lightsim2grid import LightSimBackend # highly recommended for training ! + import ray + from l2rpn_baselines.PPO_RLLIB import train + + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + backend=LightSimBackend()) + + ray.init() + try: + trained_aget = train( + env, + iterations=10, # any number of iterations you want + save_path="./saved_model", # where the NN weights will be saved + name="test", # name of the baseline + net_arch=[100, 100, 100], # architecture of the NN + save_every_xxx_steps=2, # save the NN every 2 training steps + env_kwargs={"reward_class": LinesCapacityReward, + "chronics_class": MultifolderWithCache, # highly recommended + "data_feeding_kwargs": { + 'filter_func': lambda x: re.match(".*00$", x) is not None #use one over 100 chronics to train (for speed) + } + }, + verbose=True + ) + finally: + env.close() + ray.shutdown() + +Then you can load it: + +.. code-block:: python + + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! 
+ from l2rpn_baselines.PPO_RLLIB import evaluate + + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + trained_agent = evaluate( + env, + nb_episode=nb_episode, + load_path="./saved_model", # should be the same as what has been called in the train function ! + name="test3", # should be the same as what has been called in the train function ! + nb_process=1, + verbose=verbose, + ) + + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) + + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) + + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() + + +Create an agent from scratch +++++++++++++++++++++++++++++++ + +For example, to create an agent **from scratch**, with some parameters: + +.. code-block:: python + + import grid2op + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace + from lightsim2grid import LightSimBackend + from l2rpn_baselines.PPO_RLLIB import PPO_RLLIB + + env_name = "l2rpn_case14_sandbox" # or any other name + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch"] + + # create the grid2op environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use + gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) + gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + # define the configuration for the environment + env_config = {"env_name": env.env_name, + "backend_class": LightSimBackend, + "obs_attr_to_keep": obs_attr_to_keep, + "act_attr_to_keep": act_attr_to_keep, + # other type of parameters used in the "grid2op.make" + # function eg: + # "param": ... + # "reward_class": ... + # "other_reward": ... + # "difficulty": ... + } + + # now define the configuration for the PPOTrainer + env_config_ppo = { + # config to pass to env class + "env_config": env_config, + #neural network config + "lr": 1e-4, # learning_rate + "model": { + "fcnet_hiddens": [100, 100, 100], # neural net architecture + }, + # other keyword arguments + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = RLLIBAgent(env.action_space, + gym_action_space, + gym_observation_space, + nn_config=env_config_ppo, + nn_path=None # don't load it from anywhere + ) + + # use it + obs = env.reset() + reward = env.reward_range[0] + done = False + grid2op_act = grid2op_agent.act(obs, reward, done) + obs, reward, done, info = env.step(grid2op_act) + + +.. note:: + The agent above is NOT trained. So it will basically output "random" actions. 
+ + You should probably train it before hand (see the `train` function) + + +Detailed documentation +++++++++++++++++++++++++ + +.. automodule:: l2rpn_baselines.PPO_SB3 + :members: + :autosummary: diff --git a/docs/ppo_stable_baselines.rst b/docs/ppo_stable_baselines.rst index 1966963..a60061b 100644 --- a/docs/ppo_stable_baselines.rst +++ b/docs/ppo_stable_baselines.rst @@ -1,12 +1,12 @@ .. currentmodule:: l2rpn_baselines.ppo_stablebaselines -PPO Stable Baselines +PPO: with stable-baselines3 =========================================================== Description ----------- This "baseline" aims at providing a code example on how to use an agent -from the Sable Baselines repository (see https://stable-baselines3.readthedocs.io/en/master/) +from the Sable Baselines3 repository (see https://stable-baselines3.readthedocs.io/en/master/) with grid2op. It also serve a second goal, to show how to train a PPO agent to perform diff --git a/l2rpn_baselines/DuelQLeapNet/train.py b/l2rpn_baselines/DuelQLeapNet/train.py index e92bf1c..e8d8355 100755 --- a/l2rpn_baselines/DuelQLeapNet/train.py +++ b/l2rpn_baselines/DuelQLeapNet/train.py @@ -208,6 +208,7 @@ def train(env, # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than # recommended to save the "baseline" at the end of this function with: # baseline.save(path_save) + return baseline if __name__ == "__main__": diff --git a/l2rpn_baselines/DuelQSimple/train.py b/l2rpn_baselines/DuelQSimple/train.py index 9caea6c..fa806b1 100755 --- a/l2rpn_baselines/DuelQSimple/train.py +++ b/l2rpn_baselines/DuelQSimple/train.py @@ -195,6 +195,7 @@ def train(env, # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than # recommended to save the "baseline" at the end of this function with: # baseline.save(path_save) + return baseline if __name__ == "__main__": @@ -205,7 +206,7 @@ def train(env, from grid2op.Reward import L2RPNReward import re try: - from lightsim2grid.LightSimBackend import LightSimBackend + from lightsim2grid import LightSimBackend backend = LightSimBackend() except: from grid2op.Backend import PandaPowerBackend diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py index b48092e..6e7dc91 100755 --- a/l2rpn_baselines/LeapNetEncoded/train.py +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -8,6 +8,7 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +from email.mime import base import os import warnings @@ -208,7 +209,7 @@ def train(env, # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than # recommended to save the "baseline" at the end of this function with: # baseline.save(path_save) - + return baseline if __name__ == "__main__": # import grid2op diff --git a/l2rpn_baselines/PPO_RLLIB/evaluate.py b/l2rpn_baselines/PPO_RLLIB/evaluate.py index 58a021f..007052b 100644 --- a/l2rpn_baselines/PPO_RLLIB/evaluate.py +++ b/l2rpn_baselines/PPO_RLLIB/evaluate.py @@ -17,8 +17,8 @@ from l2rpn_baselines.PPO_RLLIB.rllibagent import RLLIBAgent def evaluate(env, - load_path=".", name="ppo_rllib", + load_path=".", logs_path=None, nb_episode=1, nb_process=1, @@ -83,9 +83,7 @@ def evaluate(env, Examples --------- - Here is an example on how to train a ppo_stablebaseline . 
- - First define a python script, for example + Here is an example on how to evaluate a PPO agent (trained using RLLIB): .. code-block:: python @@ -217,7 +215,7 @@ def evaluate(env, evaluate(env, nb_episode=nb_episode, load_path="./saved_model", # should be the same as what has been called in the train function ! - name="test", # should be the same as what has been called in the train function ! + name="test3", # should be the same as what has been called in the train function ! nb_process=1, verbose=verbose, ) diff --git a/l2rpn_baselines/PPO_RLLIB/rllibagent.py b/l2rpn_baselines/PPO_RLLIB/rllibagent.py index 4de4a0d..97ce708 100644 --- a/l2rpn_baselines/PPO_RLLIB/rllibagent.py +++ b/l2rpn_baselines/PPO_RLLIB/rllibagent.py @@ -54,6 +54,92 @@ class RLLIBAgent(GymAgent): from l2rpn_baselines.PPO_RLLIB import Env_RLLIB PPOTrainer(env=Env_RLLIB, config=nn_config) + Examples + --------- + The best way to have such an agent is either to train it: + + .. code-block:: python + + from l2rpn_baselnes.PPO_RLLIB import train + agent = train(...) # see the doc of the `train` function ! + + Or you can also load it when you evaluate it (after it has been trained !): + + .. code-block:: python + + from l2rpn_baselnes.PPO_RLLIB import evaluate + agent = evaluate(...) # see the doc of the `evaluate` function ! + + To create such an agent from scratch (NOT RECOMMENDED), you can do: + + .. code-block:: python + + import grid2op + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace, GymEnv + from lightsim2grid import LightSimBackend + + from l2rpn_baselnes.PPO_RLLIB import RLLIBAgent + + env_name = "l2rpn_case14_sandbox" # or any other name + + # customize the observation / action you want to keep + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch"] + + # create the grid2op environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use + gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) + gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + # define the configuration for the environment + env_config = {"env_name": env.env_name, + "backend_class": LightSimBackend, + "obs_attr_to_keep": obs_attr_to_keep, + "act_attr_to_keep": act_attr_to_keep, + # other type of parameters used in the "grid2op.make" + # function eg: + # "param": ... + # "reward_class": ... + # "other_reward": ... + # "difficulty": ... 
+ } + + # now define the configuration for the PPOTrainer + env_config_ppo = { + # config to pass to env class + "env_config": env_config, + #neural network config + "lr": 1e-4, # learning_rate + "model": { + "fcnet_hiddens": [100, 100, 100], # neural net architecture + }, + # other keyword arguments + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = RLLIBAgent(env.action_space, + gym_action_space, + gym_observation_space, + nn_config=env_config_ppo, + nn_path=None # don't load it from anywhere + ) + + # use it + obs = env.reset() + reward = env.reward_range[0] + done = False + grid2op_act = grid2op_agent.act(obs, reward, done) + obs, reward, done, info = env.step(grid2op_act) + + # NB: the agent above is NOT trained ! So it's likely to output "random" + # actions ! + """ def __init__(self, g2op_action_space, @@ -136,3 +222,64 @@ def build(self): """ self.nn_model = PPOTrainer(**self._nn_kwargs) + +if __name__ == "__main__": + import grid2op + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace + from lightsim2grid import LightSimBackend + + env_name = "l2rpn_case14_sandbox" # or any other name + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch"] + + # create the grid2op environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use + gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) + gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + + # define the configuration for the environment + env_config = {"env_name": env.env_name, + "backend_class": LightSimBackend, + "obs_attr_to_keep": obs_attr_to_keep, + "act_attr_to_keep": act_attr_to_keep, + # other type of parameters used in the "grid2op.make" + # function eg: + # "param": ... + # "reward_class": ... + # "other_reward": ... + # "difficulty": ... 
+ } + + # now define the configuration for the PPOTrainer + env_config_ppo = { + # config to pass to env class + "env_config": env_config, + #neural network config + "lr": 1e-4, # learning_rate + "model": { + "fcnet_hiddens": [100, 100, 100], # neural net architecture + }, + # other keyword arguments + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = RLLIBAgent(env.action_space, + gym_action_space, + gym_observation_space, + nn_config=env_config_ppo, + nn_path=None # don't load it from anywhere + ) + + # use it + obs = env.reset() + reward = env.reward_range[0] + done = False + grid2op_act = grid2op_agent.act(obs, reward, done) + obs, reward, done, info = env.step(grid2op_act) + \ No newline at end of file diff --git a/l2rpn_baselines/PPO_RLLIB/train.py b/l2rpn_baselines/PPO_RLLIB/train.py index 268db99..280a0ac 100644 --- a/l2rpn_baselines/PPO_RLLIB/train.py +++ b/l2rpn_baselines/PPO_RLLIB/train.py @@ -9,12 +9,15 @@ import os import grid2op import copy +from grid2op.gym_compat import GymEnv, BoxGymObsSpace, BoxGymActSpace + from l2rpn_baselines.PPO_RLLIB.env_rllib import Env_RLLIB from l2rpn_baselines.PPO_SB3 import (default_obs_attr_to_keep, default_act_attr_to_keep, save_used_attribute, remove_non_usable_attr ) +from l2rpn_baselines.PPO_RLLIB.rllibagent import RLLIBAgent try: import ray @@ -24,11 +27,12 @@ except ImportError as exc_: _CAN_USE_RLLIB = False + def train(env, name="ppo_rllib", iterations=1, save_path=None, - load_path=None, # TODO + load_path=None, net_arch=None, learning_rate=3e-4, verbose=False, @@ -147,19 +151,19 @@ def train(env, ray.init() # if needed (you might have it already working somewhere) try: train(env, - iterations=10, # any number of iterations you want - save_path="./saved_model", # where the NN weights will be saved - name="test", # name of the baseline - net_arch=[100, 100, 100], # architecture of the NN - save_every_xxx_steps=2, # save the NN every 2 training steps - env_kwargs={"reward_class": LinesCapacityReward, - "chronics_class": MultifolderWithCache, # highly recommended - "data_feeding_kwargs": { - 'filter_func': lambda x: re.match(".*00$", x) is not None #use one over 100 chronics to train (for speed) - } - }, - verbose=True - ) + iterations=10, # any number of iterations you want + save_path="./saved_model", # where the NN weights will be saved + name="test", # name of the baseline + net_arch=[100, 100, 100], # architecture of the NN + save_every_xxx_steps=2, # save the NN every 2 training steps + env_kwargs={"reward_class": LinesCapacityReward, + "chronics_class": MultifolderWithCache, # highly recommended + "data_feeding_kwargs": { + 'filter_func': lambda x: re.match(".*00$", x) is not None #use one over 100 chronics to train (for speed) + } + }, + verbose=True + ) finally: env.close() ray.shutdown() # if needed (you might have it already working somewhere) @@ -187,8 +191,8 @@ def train(env, env_params = env.get_kwargs() env_config = {"env_name": env.env_name, "backend_class": env_params["_raw_backend_class"], - "obs_attr_to_keep": default_obs_attr_to_keep, - "act_attr_to_keep": default_act_attr_to_keep, + "obs_attr_to_keep": obs_attr_to_keep, + "act_attr_to_keep": act_attr_to_keep, **env_kwargs} env_config_ppo = { @@ -201,26 +205,40 @@ def train(env, }, **kwargs } - + # store it encoded = jsonpickle.encode(env_config_ppo) with open(os.path.join(path_expe, "env_config.json"), "w", encoding="utf-8") as f: f.write(encoded) + # define the gym environment from the grid2op env + env_gym = 
GymEnv(env) + env_gym.observation_space.close() + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep) # then define a "trainer" - # TODO what if we want to restore it ! - trainer = ppo.PPOTrainer(env=Env_RLLIB, config=env_config_ppo) + agent = RLLIBAgent(g2op_action_space=env.action_space, + gym_act_space=env_gym.action_space, + gym_obs_space=env_gym.observation_space, + nn_config=env_config_ppo, + nn_path=load_path) + for step in range(iterations): # Perform one iteration of training the policy with PPO - result = trainer.train() + result = agent.nn_model.train() if verbose: print(pretty_print(result)) if need_saving and step % save_every_xxx_steps == 0: - checkpoint = trainer.save(checkpoint_dir=path_expe) + agent.nn_model.save(checkpoint_dir=path_expe) - checkpoint = trainer.save(checkpoint_dir=path_expe) - return trainer + if need_saving: + agent.nn_model.save(checkpoint_dir=path_expe) + + return agent if __name__ == "__main__": @@ -241,7 +259,7 @@ def train(env, train(env, iterations=10, # any number of iterations you want save_path="./saved_model", # where the NN weights will be saved - name="test", # name of the baseline + name="test3", # name of the baseline net_arch=[100, 100, 100], # architecture of the NN save_every_xxx_steps=2, # save the NN every 2 training steps env_kwargs={"reward_class": LinesCapacityReward, diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index 32d4d03..ac011aa 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -85,9 +85,8 @@ def evaluate(env, Examples --------- - Here is an example on how to train a ppo_stablebaseline . - - First define a python script, for example + Here is an example on how to evaluate an PPO agent (previously trained + with stable baselines3): .. code-block:: python diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index 78af502..a948d97 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -179,10 +179,11 @@ def train(env, # define the gym environment from the grid2op env env_gym = GymEnv(env) env_gym.observation_space.close() - env_gym.observation_space = BoxGymObsSpace(env.observation_space, - attr_to_keep=obs_attr_to_keep) + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) env_gym.action_space.close() - env_gym.action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + env_gym.action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep) # Save a checkpoint every 1000 steps @@ -205,26 +206,28 @@ def train(env, if logs_dir is not None: if not os.path.exists(logs_dir): os.mkdir(logs_dir) - model = PPO(model_policy, - env_gym, - verbose=1, - learning_rate=learning_rate, - tensorboard_log=os.path.join(logs_dir, name), - policy_kwargs=policy_kwargs, - **kwargs) + # model = PPO(model_policy, + # env_gym, + # verbose=1, + # learning_rate=learning_rate, + # tensorboard_log=os.path.join(logs_dir, name), + # policy_kwargs=policy_kwargs, + # **kwargs) + agent = ... else: - # TODO ! model = PPO.load(os.path.join(load_path, name)) + agent = ... 
# train it - model.learn(total_timesteps=iterations, - callback=checkpoint_callback) + agent.nn_model.learn(total_timesteps=iterations, + callback=checkpoint_callback) # save it if save_path is not None: model.save(os.path.join(my_path, name)) env_gym.close() + return agent # TODO if __name__ == "__main__": diff --git a/l2rpn_baselines/PPO_SB3/utils.py b/l2rpn_baselines/PPO_SB3/utils.py index 72481b2..6f1b225 100644 --- a/l2rpn_baselines/PPO_SB3/utils.py +++ b/l2rpn_baselines/PPO_SB3/utils.py @@ -131,6 +131,75 @@ class SB3Agent(GymAgent): - `nn_kwargs`: the parameters used to build the neural network from scratch. Exactly one of `nn_path` and `nn_kwargs` should be provided. No more, no less. + + Examples + --------- + + The best way to have such an agent is either to train it: + + .. code-block:: python + + from l2rpn_baselnes.PPO_SB3 import train + agent = train(...) # see the doc of the `train` function ! + + Or you can also load it when you evaluate it (after it has been trained !): + + .. code-block:: python + + from l2rpn_baselnes.PPO_SB3 import evaluate + agent = evaluate(...) # see the doc of the `evaluate` function ! + + To create such an agent from scratch (NOT RECOMMENDED), you can do: + + .. code-block:: python + + import grid2op + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace, GymEnv + from lightsim2grid import LightSimBackend + + from l2rpn_baselnes.PPO_SB3 import PPO_SB3 + + env_name = "l2rpn_case14_sandbox" # or any other name + + # customize the observation / action you want to keep + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch", "curtail", "set_storage"] + + # create the grid2op environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use + env_gym = GymEnv(env) + env_gym.observation_space.close() + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep) + + # create the key word arguments used for the NN + nn_kwargs = { + "policy": MlpPolicy, + "env": env_gym, + "verbose": 0, + "learning_rate": 1e-3, + "tensorboard_log": ..., + "policy_kwargs": { + "net_arch": [100, 100, 100] + } + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = PPO_SB3(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs=nn_kwargs # don't load it from anywhere + ) + """ def __init__(self, g2op_action_space, @@ -187,3 +256,54 @@ def build(self): PPO(**nn_kwargs) """ self.nn_model = PPO(**self._nn_kwargs) + +if __name__ == "__main__": + PPO_SB3 = SB3Agent + + import grid2op + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace, GymEnv + from lightsim2grid import LightSimBackend + from stable_baselines3.ppo import MlpPolicy + + # from l2rpn_baselnes.PPO_SB3 import PPO_SB3 + + env_name = "l2rpn_case14_sandbox" # or any other name + + # customize the observation / action you want to keep + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", 
"time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch", "curtail", "set_storage"] + + # create the grid2op environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use + env_gym = GymEnv(env) + env_gym.observation_space.close() + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) + env_gym.action_space.close() + env_gym.action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep) + + # create the key word arguments used for the NN + nn_kwargs = { + "policy": MlpPolicy, + "env": env_gym, + "verbose": 0, + "learning_rate": 1e-3, + "tensorboard_log": ..., + "policy_kwargs": { + "net_arch": [100, 100, 100] + } + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = PPO_SB3(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs=nn_kwargs # don't load it from anywhere + ) From 53827a5a329b299f0606e609cea4e5a42b4888d7 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 11:16:11 +0100 Subject: [PATCH 18/56] doc should be finished for ppo implementations --- docs/index.rst | 2 +- docs/ppo_stable_baselines.rst | 174 ++++++++++++++++++++-------- l2rpn_baselines/PPO_SB3/evaluate.py | 79 ++++++------- l2rpn_baselines/PPO_SB3/train.py | 50 +++++--- 4 files changed, 199 insertions(+), 106 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index c945d15..dab24fc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,8 +30,8 @@ learning to the power grid control problem. :maxdepth: 1 :caption: Open source libraries - ppo_stable_baselines ppo_rllib + ppo_stable_baselines Other contributions --------------------- diff --git a/docs/ppo_stable_baselines.rst b/docs/ppo_stable_baselines.rst index a60061b..3a48cb6 100644 --- a/docs/ppo_stable_baselines.rst +++ b/docs/ppo_stable_baselines.rst @@ -25,76 +25,148 @@ You can use this class with: from l2rpn_baselines.PPO_SB3 import train, evaluate, PPO_SB3 -Create an agent from scratch -++++++++++++++++++++++++++++++ +Used a trained agent +++++++++++++++++++++++ -For example, to create an agent from scratch, with some parameters: +You first need to train it: .. code-block:: python + import re import grid2op - from grid2op.gym_compat import GymEnv, BoxGymActSpace - from l2rpn_baselines.PPO_SB3 import PPO_SB3 + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + from grid2op.Chronics import MultifolderWithCache # highly recommended for training + from l2rpn_baselines.PPO_SB3 import train + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache) + + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*0$", x) is not None) + env.chronics_handler.real_data.reset() + # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline + # for more information ! + train(env, + iterations=1_000, + logs_dir="./logs", + save_path="./saved_model", + name="test", + net_arch=[200, 200, 200], + save_every_xxx_steps=2000, + ) + +Then you can load it: - # create the grid2op environment - env = grid2op.make(...) - ############# +.. 
code-block:: python - # convert it to a suitable gym environment - env_gym = GymEnv(env) - env_gym.action_space.close() - env_gym.action_space = BoxGymActSpace(env.action_space) - ############# - - # create the PPO Stable Baselines agent (only some basic configs are given here) - agent = PPO_SB3(env.action_space, - env_gym.action_space, - env_gym.observation_space, - nn_kwargs={ - "policy": MlpPolicy, # or any other stable baselines 3 policy - "env": env_gym, - "verbose": 1, # or anything else - "learning_rate": 3e-4, # you can change that - "policy_kwargs": { - "net_arch": [100, 100, 100] # and that - } - }, - nn_path=None - ) + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! + from l2rpn_baselines.PPO_SB3 import evaluate + + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + trained_agent, res_eval = evaluate( + env, + nb_episode=nb_episode, + load_path="./saved_model", + name="test4", + nb_process=1, + verbose=verbose, + ) -.. note:: - The agent above is NOT trained. So it will basically output "random" actions. + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) - You should probably train it before hand (see the `train` function) + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) -Load a trained agent -+++++++++++++++++++++++ -You can also load a trained agent, to use it with a grid2op environment, in a runner, -in grid2game or any other frameworks related to grid2op. + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() +Create an agent from scratch +++++++++++++++++++++++++++++++ + +For example, to create an agent **from scratch**, with some parameters: + .. code-block:: python import grid2op - from grid2op.gym_compat import GymEnv, BoxGymActSpace + from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace + from lightsim2grid import LightSimBackend + from stable_baselines3.ppo import MlpPolicy from l2rpn_baselines.PPO_SB3 import PPO_SB3 - + + env_name = "l2rpn_case14_sandbox" # or any other name + + # customize the observation / action you want to keep + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status", + "storage_power", "storage_charge"] + act_attr_to_keep = ["redispatch"] + # create the grid2op environment - env = grid2op.make(...) 
- ############# - - # convert it to a suitable gym environment + env = grid2op.make(env_name, backend=LightSimBackend()) + + # define the action space and observation space that your agent + # will be able to use env_gym = GymEnv(env) + env_gym.observation_space.close() + env_gym.observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep) env_gym.action_space.close() - env_gym.action_space = BoxGymActSpace(env.action_space) - ############# - - # create the PPO Stable Baselines agent (only some basic configs are given here) - agent = PPO_SB3(env.action_space, - env_gym.action_space, - env_gym.observation_space, - nn_path=... # path where you saved it ! - ) + env_gym.action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep) + + # create the key word arguments used for the NN + nn_kwargs = { + "policy": MlpPolicy, + "env": env_gym, + "verbose": 0, + "learning_rate": 1e-3, + "tensorboard_log": ..., + "policy_kwargs": { + "net_arch": [100, 100, 100] + } + } + + # create a grid2gop agent based on that (this will reload the save weights) + grid2op_agent = PPO_SB3(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs=nn_kwargs + ) + + +.. note:: + The agent above is NOT trained. So it will basically output "random" actions. + + You should probably train it before hand (see the `train` function) Detailed documentation diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index ac011aa..b66a513 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -195,45 +195,46 @@ def evaluate(env, if __name__ == "__main__": - import grid2op - from grid2op.Reward import LinesCapacityReward # or any other rewards - from lightsim2grid import LightSimBackend # highly recommended ! - - nb_episode = 7 - nb_process = 1 - verbose = True - - env_name = "l2rpn_case14_sandbox" - env = grid2op.make(env_name, - reward_class=LinesCapacityReward, - backend=LightSimBackend() - ) - - try: - evaluate(env, - nb_episode=nb_episode, - load_path="./saved_model", - name="test", - nb_process=1, - verbose=verbose, - ) + import grid2op + from grid2op.Reward import LinesCapacityReward # or any other rewards + from lightsim2grid import LightSimBackend # highly recommended ! 
+ + nb_episode = 7 + nb_process = 1 + verbose = True + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name, + reward_class=LinesCapacityReward, + backend=LightSimBackend() + ) + + try: + trained_agent, res_eval = evaluate( + env, + nb_episode=nb_episode, + load_path="./saved_model", + name="test4", + nb_process=1, + verbose=verbose, + ) - # you can also compare your agent with the do nothing agent relatively - # easily - runner_params = env.get_params_for_runner() - runner = Runner(**runner_params) + # you can also compare your agent with the do nothing agent relatively + # easily + runner_params = env.get_params_for_runner() + runner = Runner(**runner_params) - res = runner.run(nb_episode=nb_episode, - nb_process=nb_process - ) + res = runner.run(nb_episode=nb_episode, + nb_process=nb_process + ) - # Print summary - if verbose: - print("Evaluation summary for DN:") - for _, chron_name, cum_reward, nb_time_step, max_ts in res: - msg_tmp = "chronics at: {}".format(chron_name) - msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) - msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) - print(msg_tmp) - finally: - env.close() + # Print summary + if verbose: + print("Evaluation summary for DN:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + finally: + env.close() diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index a948d97..7e84028 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -7,6 +7,7 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. import pdb +from tabnanny import verbose import warnings import copy import os @@ -15,6 +16,8 @@ from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv +from l2rpn_baselines.PPO_SB3.utils import SB3Agent + try: from stable_baselines3.common.callbacks import CheckpointCallback from stable_baselines3 import PPO @@ -47,6 +50,7 @@ def train(env, model_policy=MlpPolicy, obs_attr_to_keep=copy.deepcopy(default_obs_attr_to_keep), act_attr_to_keep=copy.deepcopy(default_act_attr_to_keep), + policy_kwargs=None, **kwargs): """ This function will use stable baselines 3 to train a PPO agent on @@ -113,6 +117,10 @@ def train(env, verbose: ``bool`` If you want something to be printed on the terminal (a better logging strategy will be put at some point) + policy_kwargs: ``dict`` + extra parameters passed to the PPO "policy_kwargs" key word arguments + (defaults to ``None``) + kwargs: extra parameters passed to the PPO from stable baselines 3 @@ -153,7 +161,8 @@ def train(env, # for more information ! 
try: - train(env, + trained_agent = train( + env, iterations=10_000, # any number of iterations you want logs_dir="./logs", # where the tensorboard logs will be put save_path="./saved_model", # where the NN weights will be saved @@ -200,23 +209,34 @@ def train(env, # define the policy if load_path is None: - policy_kwargs = {} + if policy_kwargs is None: + policy_kwargs = {} if net_arch is not None: policy_kwargs["net_arch"] = net_arch if logs_dir is not None: if not os.path.exists(logs_dir): os.mkdir(logs_dir) - # model = PPO(model_policy, - # env_gym, - # verbose=1, - # learning_rate=learning_rate, - # tensorboard_log=os.path.join(logs_dir, name), - # policy_kwargs=policy_kwargs, - # **kwargs) - agent = ... - else: - model = PPO.load(os.path.join(load_path, name)) - agent = ... + + nn_kwargs = { + "policy": model_policy, + "env": env_gym, + "verbose": verbose, + "learning_rate": learning_rate, + "tensorboard_log": logs_dir, + "policy_kwargs": policy_kwargs, + **kwargs + } + agent = SB3Agent(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_kwargs=nn_kwargs, + ) + else: + agent = SB3Agent(env.action_space, + env_gym.action_space, + env_gym.observation_space, + nn_path=os.path.join(load_path, name) + ) # train it agent.nn_model.learn(total_timesteps=iterations, @@ -224,7 +244,7 @@ def train(env, # save it if save_path is not None: - model.save(os.path.join(my_path, name)) + agent.nn_model.save(os.path.join(my_path, name)) env_gym.close() return agent # TODO @@ -251,7 +271,7 @@ def train(env, iterations=1_000, logs_dir="./logs", save_path="./saved_model", - name="test3", + name="test4", net_arch=[200, 200, 200], save_every_xxx_steps=2000, ) From ae1187fee08bf1e33100a8b09d304454a6f8c6be Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 11:55:17 +0100 Subject: [PATCH 19/56] adding some tests for the PPO implementations --- l2rpn_baselines/PPO_RLLIB/__init__.py | 9 +++ l2rpn_baselines/PPO_RLLIB/train.py | 12 ++-- l2rpn_baselines/test/test_import.py | 4 ++ l2rpn_baselines/test/test_train_eval.py | 78 +++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 5 deletions(-) diff --git a/l2rpn_baselines/PPO_RLLIB/__init__.py b/l2rpn_baselines/PPO_RLLIB/__init__.py index c30990a..0209628 100644 --- a/l2rpn_baselines/PPO_RLLIB/__init__.py +++ b/l2rpn_baselines/PPO_RLLIB/__init__.py @@ -6,3 +6,12 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+__all__ = [ + "evaluate", + "train", + "PPO_RLLIB" +] + +from l2rpn_baselines.PPO_RLLIB.rllibagent import RLLIBAgent as PPO_RLLIB +from l2rpn_baselines.PPO_RLLIB.evaluate import evaluate +from l2rpn_baselines.PPO_RLLIB.train import train diff --git a/l2rpn_baselines/PPO_RLLIB/train.py b/l2rpn_baselines/PPO_RLLIB/train.py index 280a0ac..146db81 100644 --- a/l2rpn_baselines/PPO_RLLIB/train.py +++ b/l2rpn_baselines/PPO_RLLIB/train.py @@ -183,7 +183,8 @@ def train(env, # save the attributes kept act_attr_to_keep = remove_non_usable_attr(env, act_attr_to_keep) - need_saving = save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) + need_saving_final = save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) + need_saving = need_saving_final and save_every_xxx_steps is not None if env_kwargs is None: env_kwargs = {} @@ -195,14 +196,15 @@ def train(env, "act_attr_to_keep": act_attr_to_keep, **env_kwargs} + model_dict = {} + if net_arch is not None: + model_dict["fcnet_hiddens"] = net_arch env_config_ppo = { # config to pass to env class "env_config": env_config, #neural network config "lr": learning_rate, - "model": { - "fcnet_hiddens": net_arch, - }, + "model": model_dict, **kwargs } @@ -235,7 +237,7 @@ def train(env, if need_saving and step % save_every_xxx_steps == 0: agent.nn_model.save(checkpoint_dir=path_expe) - if need_saving: + if need_saving_final: agent.nn_model.save(checkpoint_dir=path_expe) return agent diff --git a/l2rpn_baselines/test/test_import.py b/l2rpn_baselines/test/test_import.py index 7debb31..78a89be 100644 --- a/l2rpn_baselines/test/test_import.py +++ b/l2rpn_baselines/test/test_import.py @@ -89,6 +89,10 @@ class TestPPOSB3(TestImport, unittest.TestCase): def load_module(self): return "PPO_SB3" +class TestPPOSB3(TestImport, unittest.TestCase): + def load_module(self): + return "PPO_RLLIB" + # because it deactivates the eager mode # class TestPandapowerGeirina(TestImport, unittest.TestCase): diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py index 334ca27..6534be8 100644 --- a/l2rpn_baselines/test/test_train_eval.py +++ b/l2rpn_baselines/test/test_train_eval.py @@ -75,6 +75,21 @@ has_ExpertAgent = None except ImportError as exc_: has_ExpertAgent = exc_ + has_SliceRDQN = exc_ + +try: + from l2rpn_baselines.PPO_RLLIB import train as train_ppo_rllib + from l2rpn_baselines.PPO_RLLIB import evaluate as eval_ppo_rllib + has_ppo_rllib = None +except ImportError as exc_: + has_ppo_rllib = exc_ + +try: + from l2rpn_baselines.PPO_SB3 import train as train_ppo_sb3 + from l2rpn_baselines.PPO_SB3 import evaluate as eval_ppo_sb3 + has_ppo_sb3 = None +except ImportError as exc_: + has_ppo_sb3 = exc_ class TestDeepQSimple(unittest.TestCase): @@ -646,5 +661,68 @@ def test_train_eval(self): assert res is not None +class TestPPOSB3(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + if has_ppo_sb3 is not None: + raise ImportError(f"PPO_SB3 is not available with error:\n{has_ppo_sb3}") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("l2rpn_case14_sandbox", test=True) + nm_ = "TestPPOSB3" + + train_ppo_sb3(env, + name=nm_, + iterations=10, + save_path=tmp_dir, + load_path=None, + logs_dir=tmp_dir, + learning_rate=1e-4, + verbose=False) + + agent, eval_res = eval_ppo_sb3(env, + load_path=tmp_dir, + name=nm_, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + assert eval_res is not None + + 
+class TestPPORLLIB(unittest.TestCase): + def test_train_eval(self): + tmp_dir = tempfile.mkdtemp() + if has_ppo_rllib is not None: + raise ImportError(f"PPO_RLLIB is not available with error:\n{has_ppo_rllib}") + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("l2rpn_case14_sandbox", test=True) + nm_ = "TestPPORLLIB" + + train_ppo_rllib(env, + name=nm_, + iterations=1, + save_path=tmp_dir, + load_path=None, + learning_rate=1e-4, + verbose=False) + + agent, eval_res = eval_ppo_rllib(env, + load_path=tmp_dir, + name=nm_, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=10, + verbose=False, + save_gif=False) + assert eval_res is not None + + if __name__ == "__main__": unittest.main() From 2a6aefdca07440cf0594b29b5ec0972c614285c2 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 12:18:30 +0100 Subject: [PATCH 20/56] adding some issue templates --- CHANGELOG.rst | 8 ++--- ISSUE_TEMPLATE/bug_report.md | 49 ++++++++++++++++++++++++++++ ISSUE_TEMPLATE/documentation.md | 27 ++++++++++++++++ ISSUE_TEMPLATE/external_contrib.md | 34 ++++++++++++++++++++ ISSUE_TEMPLATE/feature_request.md | 51 ++++++++++++++++++++++++++++++ 5 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 ISSUE_TEMPLATE/bug_report.md create mode 100644 ISSUE_TEMPLATE/documentation.md create mode 100644 ISSUE_TEMPLATE/external_contrib.md create mode 100644 ISSUE_TEMPLATE/feature_request.md diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b5727b1..f9282a1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,10 +3,8 @@ Change Log [TODO] -------- -- wirte github issue templates for normal bugs and also for contributions - in the "examples" folder, make some examples for possible "submissions" usable in the competition for PPO_SB3 and PPO_RLLIB -- code a baseline example using ray / rllib - code a baseline example using mazerl - code a baseline using deepmind acme - code a baseline with a GNN somewhere @@ -23,14 +21,16 @@ Change Log `[BASELINENAME]` with ... the baseline name (*eg* `from l2rpn_baselines.DoNothing import DoNothing`) - [FIXED] clean the documentation - [FIXED] some bugs (especially in the type of actions) for some agents -- [ADDED] a code example to use stable baselines 3 (see l2rpn_baselines.ppo_stable_baselines) +- [ADDED] a code example to use stable baselines 3 (see `l2rpn_baselines.PPO_SB3`) +- [ADDED] a code example to use RLLIB (see `l2rpn_baselines.PPO_RLLIB`) +- [ADDED] some issue templates [0.5.1] - 2021-04-09 --------------------- - [FIXED] issue with grid2op version >= 1.2.3 for some baselines - [FIXED] `Issue 26 `_ : package can be installed even if the requirement for some baselines is not met. -- [UPDATED] Kaist baselines +- [UPDATED] `Kaist` baselines - [ADDED] The expert agent [0.5.0] - 2020-08-18 diff --git a/ISSUE_TEMPLATE/bug_report.md b/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..7d75069 --- /dev/null +++ b/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,49 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +## System information + - Grid2op version: `1.x.x` + - l2rpn-baselines version: `1.x.x` + - System: `windows, osx, ubuntu16.04, ...` + - Baseline concerned: *eg* `PPO_SB3` or `ExpertAgent` + - *Additional system information* + +## Bug description + + +## How to reproduce + + +### Command line (if any) + +```bash +# command line used if any +``` + +### Code snippet (if any) + +```python +import grid2op + +... 
# Some code +``` + +## Current output + +``` +The output of the code snippet above +``` + +## Expected output + +``` +The expected output and/or expected behavior description +``` diff --git a/ISSUE_TEMPLATE/documentation.md b/ISSUE_TEMPLATE/documentation.md new file mode 100644 index 0000000..8581b28 --- /dev/null +++ b/ISSUE_TEMPLATE/documentation.md @@ -0,0 +1,27 @@ +--- +name: Documentation +about: Anything related to the documentation +title: '' +labels: documentation +assignees: '' + +--- + +## Documentation issue description + + +## Suggested modifications + + +``` +Please note: + +Documentation issues are low priority. +Please provide your suggested modifications to increase processing speed. +Thanks for your understanding. +``` + +## Additional context + diff --git a/ISSUE_TEMPLATE/external_contrib.md b/ISSUE_TEMPLATE/external_contrib.md new file mode 100644 index 0000000..043fd98 --- /dev/null +++ b/ISSUE_TEMPLATE/external_contrib.md @@ -0,0 +1,34 @@ +--- +name: External contribution +about: Add an external contribution that many people will be able to use +title: '' +labels: external_contrib +assignees: '' + +--- + +## Algorithm implementation + + +Ex: https://github.com/blablabla/myawesomemodel.git + +Check your model follows the template code for l2rpn-baselines: + +- [ ] it is freely available on github / gitlab +- [ ] it exports a function call exactly `evaluate` +- [ ] it exports a class named like your github / gitlab (for example, if it is + hosted at https://github.com/YourName/MyAwesomeModel) then you need to be able to do something like `from MyAwesomeModel import MyAwesomeModel`). Once + included into l2rpn-baselines, it will be imported with `from l2rpn_baselines.MyAwesomeModel import MyAwesomeModel` + +## License + +Detail here the license that is applicable to your code. It will also be written in the documentation. + +## Short description + + + +## Extra references + + diff --git a/ISSUE_TEMPLATE/feature_request.md b/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..aff0b0d --- /dev/null +++ b/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,51 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +## Is your feature request related to a problem? Please describe. + + + + +Ex. This is what i do: + +```python +import l2rpn_baselines +... + +``` + +## Is your feature request related to a new RL agorithm? Please describe. + + +- [ ] yes +- [ ] no + +## Describe the solution you'd like + + + +Ex. This is how i would like it to be done: +```python +import l2rpn_baselines +... 
+ +# give an example on how your awesome new feature would behave +``` + +## Describe alternatives you've considered + + +## Additional context + From 3d2607b8b2202282a3cbc0c741711b6ddf2dda59 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 12:25:58 +0100 Subject: [PATCH 21/56] moving the issue template at the proper location --- .../ISSUE_TEMPLATE}/bug_report.md | 0 .../ISSUE_TEMPLATE}/documentation.md | 0 .../ISSUE_TEMPLATE}/external_contrib.md | 19 ++++++++++++++++--- .../ISSUE_TEMPLATE}/feature_request.md | 0 setup.py | 12 +++++------- 5 files changed, 21 insertions(+), 10 deletions(-) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/bug_report.md (100%) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/documentation.md (100%) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/external_contrib.md (67%) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/feature_request.md (100%) diff --git a/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 100% rename from ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug_report.md diff --git a/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md similarity index 100% rename from ISSUE_TEMPLATE/documentation.md rename to .github/ISSUE_TEMPLATE/documentation.md diff --git a/ISSUE_TEMPLATE/external_contrib.md b/.github/ISSUE_TEMPLATE/external_contrib.md similarity index 67% rename from ISSUE_TEMPLATE/external_contrib.md rename to .github/ISSUE_TEMPLATE/external_contrib.md index 043fd98..ffe77c9 100644 --- a/ISSUE_TEMPLATE/external_contrib.md +++ b/.github/ISSUE_TEMPLATE/external_contrib.md @@ -21,14 +21,27 @@ Check your model follows the template code for l2rpn-baselines: included into l2rpn-baselines, it will be imported with `from l2rpn_baselines.MyAwesomeModel import MyAwesomeModel` ## License - -Detail here the license that is applicable to your code. It will also be written in the documentation. + + +ex. I adopted for the MPL v2.0 license ## Short description -## Extra references +## Required packages (optional) + + +- grid2op version xx.yy.zz +- numpy version xx.yy.zz +- etc. 
+ +## Extra references (optional) diff --git a/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/setup.py b/setup.py index ae76aa0..2284255 100644 --- a/setup.py +++ b/setup.py @@ -27,18 +27,16 @@ "sphinxcontrib-trio>=1.1.0", "autodocsumm>=0.1.13" ], - "challenge": ["grid2op[challenge]>=0.9.1.post1"], - "optional": ["grid2op[optional]>=1.2.0", + "optional": ["grid2op[optional]>=1.6.5", "tensorflow>=2.2.0", "Keras>=2.3.1", "torch>=1.4.0", "scikit-learn>=0.22.2", - "gym>=0.17.1" ], - "rllib": ["ray[rllib]", - "jsonpickle", - "lightsim2grid"], - "stable_baselines": ["stable_baselines3"] + "PPO_RLLIB": ["ray[rllib]", + "jsonpickle", + "lightsim2grid"], + "PPO_SB3": ["stable_baselines3"] } } From f03d8fc503ee7b8f4aec5e385f3a4f0923e821ba Mon Sep 17 00:00:00 2001 From: BDonnot Date: Mon, 31 Jan 2022 17:10:50 +0100 Subject: [PATCH 22/56] trying to work on a working example --- .gitignore | 2 + examples/ppo_stable_baselines/1_prep_env.py | 52 ++++++++ .../ppo_stable_baselines/2_train_agent.py | 122 ++++++++++++++++++ examples/ppo_stable_baselines/ReadMe.md | 24 ++++ l2rpn_baselines/PPO_SB3/evaluate.py | 8 ++ l2rpn_baselines/PPO_SB3/train.py | 41 +++++- 6 files changed, 244 insertions(+), 5 deletions(-) create mode 100644 examples/ppo_stable_baselines/1_prep_env.py create mode 100644 examples/ppo_stable_baselines/2_train_agent.py create mode 100644 examples/ppo_stable_baselines/ReadMe.md diff --git a/.gitignore b/.gitignore index a6e0dd5..414a509 100644 --- a/.gitignore +++ b/.gitignore @@ -180,3 +180,5 @@ l2rpn_baselines/PPO_SB3/logs/** l2rpn_baselines/PPO_RLLIB/logs/** l2rpn_baselines/PPO_RLLIB/saved_model/** test_jsonpickle.json +examples/ppo_stable_baselines/saved_model/** +examples/ppo_stable_baselines/logs/** \ No newline at end of file diff --git a/examples/ppo_stable_baselines/1_prep_env.py b/examples/ppo_stable_baselines/1_prep_env.py new file mode 100644 index 0000000..9c2469c --- /dev/null +++ b/examples/ppo_stable_baselines/1_prep_env.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020-2022, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +# this file needs to be run only once, it might take a while ! 
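+# (in short: it splits the environment data into train / validation / test sets and it
+#  pre-computes the "do nothing" statistics that ScoreICAPS2021 relies on when scoring
+#  trained agents later on)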
+ +import grid2op +from grid2op.dtypes import dt_int +from grid2op.utils import ScoreICAPS2021 +from lightsim2grid import LightSimBackend +import numpy as np + + +env_name = "l2rpn_icaps_2021_small" +nb_process_stats = 8 +verbose = 1 + +# create the environment +env = grid2op.make(env_name) + +# split into train / val / test +# it is such that there are 25 chronics for val and 24 for test +env.seed(1) +env.reset() +nm_train, nm_test, nm_val = env.train_val_split_random(add_for_test="test", + pct_val=4.2, + pct_test=4.2) + +# computes some statistics for val / test to compare performance of +# some agents with the do nothing for example +max_int = max_int = np.iinfo(dt_int).max +for nm_ in [nm_val, nm_test]: + env_tmp = grid2op.make(nm_, backend=LightSimBackend()) + nb_scenario = len(env_tmp.chronics_handler.subpaths) + print(f"{nm_}: {nb_scenario}") + my_score = ScoreICAPS2021(env_tmp, + nb_scenario=nb_scenario, + env_seeds=np.random.randint(low=0, + high=max_int, + size=nb_scenario, + dtype=dt_int), + agent_seeds=[0 for _ in range(nb_scenario)], + verbose=verbose, + nb_process_stats=nb_process_stats, + ) + +# my_agent = DoNothingAgent(env.action_space) +# print(my_score.get(my_agent)) \ No newline at end of file diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py new file mode 100644 index 0000000..88dc8a6 --- /dev/null +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020-2022, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +# this file is likely to be run multiple times. You might also want +# to customize the reward, the attributes of the observation +# you want to keep etc. +# Remember this is an example, that should perform relatively well (better than +# do nothing) + +import re +import numpy as np +from grid2op.Reward import BaseReward + +env_name = "l2rpn_icaps_2021_small_train" + + +# customize the reward function (optional) +class CustomReward(BaseReward): + def __init__(self): + """ + Initializes :attr:`BaseReward.reward_min` and :attr:`BaseReward.reward_max` + + """ + self.reward_min = 0. + self.reward_max = 1. + self._max_redisp = None + self._min_rho = 0.90 + self._max_rho = 2.0 + + def initialize(self, env): + self._max_redisp = np.maximum(env.gen_pmax - env.gen_pmin, 0.) 
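+        # (note) together with the two lines below, this builds roughly
+        # 1. / (gen_pmax - gen_pmin + 1) / n_gen : a per-generator scaling factor used
+        # to keep the dispatch / curtailment penalties roughly within [0, 1]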
+ self._max_redisp += 1 + self._1_max_redisp = 1.0 / self._max_redisp / env.n_gen + self._is_renew_ = env.gen_renewable + + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + if is_done: + print(f"{env.nb_time_step = }") + # episode is over => 2 cases + # if env.nb_time_step == env.max_episode_duration(): + # return self.reward_max + # else: + # return self.reward_min + return env.nb_time_step / env.max_episode_duration() + if is_illegal or is_ambiguous or has_error: + return self.reward_min + + # penalize the dispatch + obs = env.get_obs() + score_redisp = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) + + # penalize the curtailment + score_curtail = np.sum(obs.curtailment_mw * self._1_max_redisp) + score_action = 0.5 * (np.sqrt(score_redisp) + np.sqrt(score_curtail)) + + # score the "state" of the grid + tmp_state = np.minimum(np.maximum(obs.rho, self._min_rho), self._max_rho) + tmp_state -= self._min_rho + tmp_state /= (self._max_rho - self._min_rho) * env.n_line + score_state = np.sqrt(np.sqrt(np.sum(tmp_state))) + + # score close to goal + score_goal = env.nb_time_step / env.max_episode_duration() + + res = score_goal * (1.0 - 0.5 * (score_action + score_state)) + return score_goal * res + + +if __name__ == "__main__": + + import grid2op + from l2rpn_baselines.PPO_SB3 import train + from lightsim2grid import LightSimBackend # highly recommended ! + from grid2op.Chronics import MultifolderWithCache # highly recommended for training + + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", + "gen_p", "load_p", + "actual_dispatch", "target_dispatch", + "rho", "timestep_overflow", "line_status", + "curtailment", "gen_p_before_curtail"] + + act_attr_to_keep = ["redispatch", "curtail"] + nb_iter = 3_000_000 + learning_rate = 1e-3 + net_arch = [200, 200, 200] + name = "expe_7" + + env = grid2op.make(env_name, + reward_class=CustomReward, + backend=LightSimBackend(), + chronics_class=MultifolderWithCache, + difficulty="0") + + obs = env.reset() + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*0$", x) is not None) + env.chronics_handler.real_data.reset() + # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline + # for more information ! + + trained_agent = train( + env, + iterations=nb_iter, + logs_dir="./logs", + save_path="./saved_model", + obs_attr_to_keep=obs_attr_to_keep, + act_attr_to_keep=act_attr_to_keep, + normalize_act=True, + normalize_obs=True, + name="expe_6", + learning_rate=learning_rate, + net_arch=net_arch, + save_every_xxx_steps=min(nb_iter // 10, 100_000), + verbose=1 + ) + + # TODO evaluate it ! \ No newline at end of file diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md new file mode 100644 index 0000000..53e3be1 --- /dev/null +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -0,0 +1,24 @@ +# Objective + +This repository demonstrates how to use grid2op, lightsim2grid and l2rpn-baselines to make a RL agent that is able to perform some actions on a grid2op environment using the PPO algorithm and the `stable-baselines3` rl library. 
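+
+In a nutshell, the whole pipeline looks like this (a rough sketch: the actual scripts
+below use more options, in particular a custom reward and normalized observation /
+action spaces):
+
+```python
+import grid2op
+from lightsim2grid import LightSimBackend
+from l2rpn_baselines.PPO_SB3 import train, evaluate
+
+# train on the training environment (created at step 1 below)
+env_train = grid2op.make("l2rpn_icaps_2021_small_train", backend=LightSimBackend())
+train(env_train, name="my_agent", iterations=10_000, save_path="./saved_model")
+
+# then score the trained agent on the validation environment
+env_val = grid2op.make("l2rpn_icaps_2021_small_val", backend=LightSimBackend())
+agent, res = evaluate(env_val, name="my_agent", load_path="./saved_model", nb_episode=2)
+```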
+ +It focuses on the `PPO_SB3` baseline with a strong focus on **continuous** variables (curtailment and redispatching) + +It will be usable on the `l2rpn_icaps_2021` grid2op environment + +It is organized as follow: + +1) you split the environment into training and validation +2) you train the agent (do not hesitate to change the parameters there) on the + training set +3) you evaluate it on a dataset not used for training ! +4) once your are "happy" with your results on step 3 (so you will probably need to + run step 2 and 3 multiple times...) you can submit it to a l2rpn competition + +## 1 Preparing the training environment + +## 2 Training the agent + +## 3 evaluate the agent + +## 4 preparing the submision diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index b66a513..d3589df 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -152,6 +152,14 @@ def evaluate(env, gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + if os.path.exists(load_path, ".normalize_act"): + for attr_nm in act_attr_to_keep: + gym_action_space.normalize_attr(attr_nm) + + if os.path.exists(load_path, ".normalize_obs"): + for attr_nm in obs_attr_to_keep: + gym_observation_space.normalize_attr(attr_nm) + # create a grid2gop agent based on that (this will reload the save weights) full_path = os.path.join(load_path, name) grid2op_agent = SB3Agent(env.action_space, diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index 7e84028..af4bc29 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -51,6 +51,9 @@ def train(env, obs_attr_to_keep=copy.deepcopy(default_obs_attr_to_keep), act_attr_to_keep=copy.deepcopy(default_act_attr_to_keep), policy_kwargs=None, + normalize_obs=False, + normalize_act=False, + eval_env=None, # TODO **kwargs): """ This function will use stable baselines 3 to train a PPO agent on @@ -117,10 +120,18 @@ def train(env, verbose: ``bool`` If you want something to be printed on the terminal (a better logging strategy will be put at some point) + normalize_obs: ``bool`` + Attempt to normalize the observation space (so that gym-based stuff will only + see numbers between 0 and 1) + + normalize_act: ``bool`` + Attempt to normalize the action space (so that gym-based stuff will only + manipulate numbers between 0 and 1) + policy_kwargs: ``dict`` extra parameters passed to the PPO "policy_kwargs" key word arguments (defaults to ``None``) - + kwargs: extra parameters passed to the PPO from stable baselines 3 @@ -194,8 +205,23 @@ def train(env, env_gym.action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) - - # Save a checkpoint every 1000 steps + if normalize_act: + if save_path is not None: + with open(os.path.join(my_path, ".normalize_act"), encoding="utf-8", + mode="w") as f: + f.write("I have encoded the action space !\n DO NOT MODIFY !") + for attr_nm in act_attr_to_keep: + env_gym.action_space.normalize_attr(attr_nm) + + if normalize_obs: + if save_path is not None: + with open(os.path.join(my_path, ".normalize_obs"), encoding="utf-8", + mode="w") as f: + f.write("I have encoded the observation space !\n DO NOT MODIFY !") + for attr_nm in obs_attr_to_keep: + env_gym.observation_space.normalize_attr(attr_nm) + + # Save a checkpoint every "save_every_xxx_steps" steps checkpoint_callback = None if 
save_every_xxx_steps is not None: if save_path is None: @@ -216,13 +242,16 @@ def train(env, if logs_dir is not None: if not os.path.exists(logs_dir): os.mkdir(logs_dir) + this_logs_dir = os.path.join(logs_dir, name) + else: + this_logs_dir = None nn_kwargs = { "policy": model_policy, "env": env_gym, "verbose": verbose, "learning_rate": learning_rate, - "tensorboard_log": logs_dir, + "tensorboard_log": this_logs_dir, "policy_kwargs": policy_kwargs, **kwargs } @@ -240,7 +269,9 @@ def train(env, # train it agent.nn_model.learn(total_timesteps=iterations, - callback=checkpoint_callback) + callback=checkpoint_callback, + # eval_env=eval_env # TODO + ) # save it if save_path is not None: From 8708bea5809c32771b59e95e9bc057440e5e26cf Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 1 Feb 2022 18:40:08 +0100 Subject: [PATCH 23/56] improving the scripts to train the ppo_sb3 baselines after some usage --- CHANGELOG.rst | 2 + .../ppo_stable_baselines/2_train_agent.py | 40 +++++--- .../3_evaluate_trained_model.py | 95 +++++++++++++++++++ l2rpn_baselines/PPO_SB3/evaluate.py | 10 +- l2rpn_baselines/PPO_SB3/train.py | 1 + l2rpn_baselines/PPO_SB3/utils.py | 15 ++- 6 files changed, 143 insertions(+), 20 deletions(-) create mode 100644 examples/ppo_stable_baselines/3_evaluate_trained_model.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f9282a1..74b1b2b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,8 @@ Change Log -------- - in the "examples" folder, make some examples for possible "submissions" usable in the competition for PPO_SB3 and PPO_RLLIB +- add a vectorized environment for PPO in stable baselines for example + (ie map a MultiEnvironment into the proper stuff) - code a baseline example using mazerl - code a baseline using deepmind acme - code a baseline with a GNN somewhere diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index 88dc8a6..f3cea62 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -17,7 +17,7 @@ from grid2op.Reward import BaseReward env_name = "l2rpn_icaps_2021_small_train" - +save_path = "./saved_model" # customize the reward function (optional) class CustomReward(BaseReward): @@ -37,7 +37,8 @@ def initialize(self, env): self._max_redisp += 1 self._1_max_redisp = 1.0 / self._max_redisp / env.n_gen self._is_renew_ = env.gen_renewable - + self._1_max_redisp_act = np.maximum(np.maximum(env.gen_max_ramp_up, env.gen_max_ramp_down), 1.0) + self._1_max_redisp_act = 1.0 / self._1_max_redisp_act / np.sum(env.gen_redispatchable) def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): if is_done: @@ -53,10 +54,16 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # penalize the dispatch obs = env.get_obs() - score_redisp = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) + score_redisp_state = 0. + # score_redisp_state = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) + score_redisp_action = np.sum(np.abs(action.redispatch) * self._1_max_redisp_act) + score_redisp = 0.5 *(score_redisp_state + score_redisp_action) # penalize the curtailment - score_curtail = np.sum(obs.curtailment_mw * self._1_max_redisp) + score_curtail = 0. 
+ # score_curtail = np.sum(obs.curtailment_mw * self._1_max_redisp) + + # rate the actions score_action = 0.5 * (np.sqrt(score_redisp) + np.sqrt(score_curtail)) # score the "state" of the grid @@ -66,8 +73,10 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): score_state = np.sqrt(np.sqrt(np.sum(tmp_state))) # score close to goal - score_goal = env.nb_time_step / env.max_episode_duration() + score_goal = 0. + # score_goal = env.nb_time_step / env.max_episode_duration() + # score too much redisp res = score_goal * (1.0 - 0.5 * (score_action + score_state)) return score_goal * res @@ -78,27 +87,27 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): from l2rpn_baselines.PPO_SB3 import train from lightsim2grid import LightSimBackend # highly recommended ! from grid2op.Chronics import MultifolderWithCache # highly recommended for training - + obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", - "gen_p", "load_p", + "gen_p", "load_p", "p_or", "actual_dispatch", "target_dispatch", "rho", "timestep_overflow", "line_status", "curtailment", "gen_p_before_curtail"] act_attr_to_keep = ["redispatch", "curtail"] - nb_iter = 3_000_000 - learning_rate = 1e-3 + nb_iter = 300_000 + learning_rate = 3e-3 net_arch = [200, 200, 200] - name = "expe_7" + name = "expe_10" env = grid2op.make(env_name, reward_class=CustomReward, backend=LightSimBackend(), - chronics_class=MultifolderWithCache, - difficulty="0") + chronics_class=MultifolderWithCache) obs = env.reset() - env.chronics_handler.real_data.set_filter(lambda x: re.match(".*0$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*0$", x) is not None) + env.chronics_handler.real_data.set_filter(lambda x: True) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! @@ -107,16 +116,17 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): env, iterations=nb_iter, logs_dir="./logs", - save_path="./saved_model", + save_path=save_path, obs_attr_to_keep=obs_attr_to_keep, act_attr_to_keep=act_attr_to_keep, normalize_act=True, normalize_obs=True, - name="expe_6", + name=name, learning_rate=learning_rate, net_arch=net_arch, save_every_xxx_steps=min(nb_iter // 10, 100_000), verbose=1 ) + print("After training, ") # TODO evaluate it ! \ No newline at end of file diff --git a/examples/ppo_stable_baselines/3_evaluate_trained_model.py b/examples/ppo_stable_baselines/3_evaluate_trained_model.py new file mode 100644 index 0000000..b94662f --- /dev/null +++ b/examples/ppo_stable_baselines/3_evaluate_trained_model.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020-2022, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
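+
+# (usage sketch) run this script once an agent has been saved by the training script:
+# it reloads the weights from "./saved_model", scores the agent on the validation
+# environment prepared by the first script and compares the number of survived steps
+# with the "do nothing" baseline.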
+ +import os +import json +import grid2op +import numpy as np +from grid2op.utils import ScoreICAPS2021 +from lightsim2grid import LightSimBackend +from l2rpn_baselines.PPO_SB3 import evaluate + + +env_name = "l2rpn_icaps_2021_small_val" +nb_scenario = 25 +nb_process_stats = 1 +load_path = "./saved_model" + + +def _aux_get_env(env_name, dn=True): + path_ = grid2op.get_current_local_dir() + path_env = os.path.join(path_, env_name) + if not os.path.exists(path_env): + raise RuntimeError(f"The environment \"{env_name}\" does not exist.") + + path_dn = os.path.join(path_env, "_statistics_icaps2021_dn") + if not os.path.exists(path_dn): + raise RuntimeError("The folder _statistics_icaps2021_dn used for computing the score do not exist") + path_reco = os.path.join(path_env, "_statistics_l2rpn_no_overflow_reco") + if not os.path.exists(path_reco): + raise RuntimeError("The folder _statistics_l2rpn_no_overflow_reco used for computing the score do not exist") + if dn: + path_metadata = os.path.join(path_dn, "metadata.json") + else: + path_metadata = os.path.join(path_reco, "metadata.json") + + if not os.path.exists(path_metadata): + raise RuntimeError("The folder _statistics_icaps2021_dn does not appear to be a score folder") + + with open(path_metadata, "r", encoding="utf-8") as f: + dict_ = json.load(f) + + return dict_ + +def get_env_seed(env_name): + dict_ = _aux_get_env(env_name) + + key = "env_seeds" + if key not in dict_: + raise RuntimeError(f"Impossible to find the key {key} in the dictionnary. You should re run the score function.") + + return dict_[key] + +def load_agent(env, load_path, name): + trained_agent, _ = evaluate(env, + nb_episode=0, + load_path=load_path, + name=name) + return trained_agent + + +def get_ts_survived_dn(env_name): + dict_ = _aux_get_env(env_name, dn=True) + res = [] + for kk in range(nb_scenario): + tmp_ = dict_[f"{kk}"]["nb_step"] + res.append(tmp_) + return res + + +if __name__ == "__main__": + name = "expe_8" + + # + env_val = grid2op.make(env_name, backend=LightSimBackend()) + my_score = ScoreICAPS2021(env_val, + nb_scenario=nb_scenario, + env_seeds=get_env_seed(env_name), + agent_seeds=[0 for _ in range(nb_scenario)], + verbose=False, + nb_process_stats=nb_process_stats, + ) + + my_agent = load_agent(env_val, load_path=load_path, name=name) + _, ts_survived, _ = my_score.get(my_agent) + dn_ts_survived = get_ts_survived_dn(env_name) + best_than_dn = 0 + for my_ts, dn_ts in zip(ts_survived, dn_ts_survived): + print(f"I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})") + best_than_dn += my_ts >= dn_ts + print(f"The agent \"{name}\" beats do nothing in {best_than_dn} out of {len(dn_ts_survived)} episodes") \ No newline at end of file diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index d3589df..10038f2 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -152,11 +152,11 @@ def evaluate(env, gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) - if os.path.exists(load_path, ".normalize_act"): + if os.path.exists(os.path.join(load_path, ".normalize_act")): for attr_nm in act_attr_to_keep: gym_action_space.normalize_attr(attr_nm) - if os.path.exists(load_path, ".normalize_obs"): + if os.path.exists(os.path.join(load_path, ".normalize_obs")): for attr_nm in obs_attr_to_keep: gym_observation_space.normalize_attr(attr_nm) @@ -165,8 +165,12 @@ def 
evaluate(env, grid2op_agent = SB3Agent(env.action_space, gym_action_space, gym_observation_space, - nn_path=os.path.join(full_path, name)) + nn_path=os.path.join(full_path, name) + ) + if nb_episode == 0: + return grid2op_agent, [] + # Build runner runner_params = env.get_params_for_runner() runner_params["verbose"] = verbose diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index af4bc29..e14b503 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -53,6 +53,7 @@ def train(env, policy_kwargs=None, normalize_obs=False, normalize_act=False, + seed=None, # TODO eval_env=None, # TODO **kwargs): """ diff --git a/l2rpn_baselines/PPO_SB3/utils.py b/l2rpn_baselines/PPO_SB3/utils.py index 6f1b225..0a1284c 100644 --- a/l2rpn_baselines/PPO_SB3/utils.py +++ b/l2rpn_baselines/PPO_SB3/utils.py @@ -207,8 +207,14 @@ def __init__(self, gym_obs_space, nn_type=PPO, nn_path=None, - nn_kwargs=None): + nn_kwargs=None, + custom_load_dict=None, + ): self._nn_type = nn_type + if custom_load_dict is not None: + self.custom_load_dict = custom_load_dict + else: + self.custom_load_dict = {} super().__init__(g2op_action_space, gym_act_space, gym_obs_space, nn_path=nn_path, nn_kwargs=nn_kwargs) @@ -244,7 +250,12 @@ def load(self): PPO.load(nn_path) """ - self.nn_model = self._nn_type.load(self._nn_path) + custom_objects = {"action_space": self._gym_act_space, + "observation_space": self._gym_obs_space} + for key, val in self.custom_load_dict.items(): + custom_objects[key] = val + self.nn_model = self._nn_type.load(self._nn_path, + custom_objects=custom_objects) def build(self): """Create the underlying NN model from scratch. From 4c34120d675fb436ceefe3bd66eb8f1c5d4aef03 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 1 Feb 2022 18:52:12 +0100 Subject: [PATCH 24/56] starting to train an agent --- .../ppo_stable_baselines/2_train_agent.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index f3cea62..2ade9fd 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -39,6 +39,7 @@ def initialize(self, env): self._is_renew_ = env.gen_renewable self._1_max_redisp_act = np.maximum(np.maximum(env.gen_max_ramp_up, env.gen_max_ramp_down), 1.0) self._1_max_redisp_act = 1.0 / self._1_max_redisp_act / np.sum(env.gen_redispatchable) + self._nb_renew = np.sum(self._is_renew_) def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): if is_done: @@ -60,21 +61,26 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): score_redisp = 0.5 *(score_redisp_state + score_redisp_action) # penalize the curtailment - score_curtail = 0. - # score_curtail = np.sum(obs.curtailment_mw * self._1_max_redisp) + score_curtail_state = 0. 
+ # score_curtail_state = np.sum(obs.curtailment_mw * self._1_max_redisp) + curt_act = action.curtail + score_curtail_action = np.sum(curt_act[curt_act != -1.0]) / self._nb_renew + score_curtail = 0.5 *(score_curtail_state + score_curtail_action) # rate the actions score_action = 0.5 * (np.sqrt(score_redisp) + np.sqrt(score_curtail)) # score the "state" of the grid - tmp_state = np.minimum(np.maximum(obs.rho, self._min_rho), self._max_rho) - tmp_state -= self._min_rho - tmp_state /= (self._max_rho - self._min_rho) * env.n_line - score_state = np.sqrt(np.sqrt(np.sum(tmp_state))) + # tmp_state = np.minimum(np.maximum(obs.rho, self._min_rho), self._max_rho) + # tmp_state -= self._min_rho + # tmp_state /= (self._max_rho - self._min_rho) * env.n_line + # score_state = np.sqrt(np.sqrt(np.sum(tmp_state))) + score_state = 0. # score close to goal - score_goal = 0. + # score_goal = 0. # score_goal = env.nb_time_step / env.max_episode_duration() + score_goal = 1.0 # score too much redisp res = score_goal * (1.0 - 0.5 * (score_action + score_state)) @@ -95,10 +101,11 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): "curtailment", "gen_p_before_curtail"] act_attr_to_keep = ["redispatch", "curtail"] - nb_iter = 300_000 + nb_iter = 6_000_000 learning_rate = 3e-3 - net_arch = [200, 200, 200] - name = "expe_10" + net_arch = [300, 300, 300, 300] + name = "expe_0" + gamma = 0.999 env = grid2op.make(env_name, reward_class=CustomReward, @@ -125,7 +132,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): learning_rate=learning_rate, net_arch=net_arch, save_every_xxx_steps=min(nb_iter // 10, 100_000), - verbose=1 + verbose=1, + gamma=0.999, ) print("After training, ") From d09e7b25ef4937e26f9eb79b90fe61786bed286e Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 1 Feb 2022 19:08:19 +0100 Subject: [PATCH 25/56] starting to train an agent --- examples/ppo_stable_baselines/2_train_agent.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index 2ade9fd..cb56ddd 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -12,6 +12,7 @@ # Remember this is an example, that should perform relatively well (better than # do nothing) +import os import re import numpy as np from grid2op.Reward import BaseReward @@ -43,7 +44,7 @@ def initialize(self, env): def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): if is_done: - print(f"{env.nb_time_step = }") + print(f"{os.path.split(env.chronics_handler.get_id())[-1]}: {env.nb_time_step = }") # episode is over => 2 cases # if env.nb_time_step == env.max_episode_duration(): # return self.reward_max @@ -52,7 +53,6 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): return env.nb_time_step / env.max_episode_duration() if is_illegal or is_ambiguous or has_error: return self.reward_min - # penalize the dispatch obs = env.get_obs() score_redisp_state = 0. 
@@ -113,12 +113,12 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): chronics_class=MultifolderWithCache) obs = env.reset() - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*0$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) env.chronics_handler.real_data.set_filter(lambda x: True) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! - + print("environment loaded !") trained_agent = train( env, iterations=nb_iter, From a676a5bfccad7149d716ca0da5d7e62ee76f451b Mon Sep 17 00:00:00 2001 From: BDonnot Date: Wed, 2 Feb 2022 10:56:43 +0100 Subject: [PATCH 26/56] continuing to add features for training DRL agents --- .../ppo_stable_baselines/2_train_agent.py | 22 +++--- .../3_evaluate_trained_model.py | 6 +- l2rpn_baselines/PPO_SB3/train.py | 12 ++- l2rpn_baselines/utils/__init__.py | 5 +- l2rpn_baselines/utils/gymenv_custom.py | 78 +++++++++++++++++++ 5 files changed, 108 insertions(+), 15 deletions(-) create mode 100644 l2rpn_baselines/utils/gymenv_custom.py diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index cb56ddd..fcfcc3b 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -55,14 +55,14 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): return self.reward_min # penalize the dispatch obs = env.get_obs() - score_redisp_state = 0. - # score_redisp_state = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) + # score_redisp_state = 0. + score_redisp_state = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) score_redisp_action = np.sum(np.abs(action.redispatch) * self._1_max_redisp_act) score_redisp = 0.5 *(score_redisp_state + score_redisp_action) # penalize the curtailment - score_curtail_state = 0. - # score_curtail_state = np.sum(obs.curtailment_mw * self._1_max_redisp) + # score_curtail_state = 0. + score_curtail_state = np.sum(obs.curtailment_mw * self._1_max_redisp) curt_act = action.curtail score_curtail_action = np.sum(curt_act[curt_act != -1.0]) / self._nb_renew score_curtail = 0.5 *(score_curtail_state + score_curtail_action) @@ -79,8 +79,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # score close to goal # score_goal = 0. - # score_goal = env.nb_time_step / env.max_episode_duration() - score_goal = 1.0 + score_goal = env.nb_time_step / env.max_episode_duration() + # score_goal = 1.0 # score too much redisp res = score_goal * (1.0 - 0.5 * (score_action + score_state)) @@ -93,6 +93,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): from l2rpn_baselines.PPO_SB3 import train from lightsim2grid import LightSimBackend # highly recommended ! 
from grid2op.Chronics import MultifolderWithCache # highly recommended for training + from l2rpn_baselines.utils import GymEnvWithReco obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "gen_p", "load_p", "p_or", @@ -103,8 +104,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): act_attr_to_keep = ["redispatch", "curtail"] nb_iter = 6_000_000 learning_rate = 3e-3 - net_arch = [300, 300, 300, 300] - name = "expe_0" + net_arch = [300, 300, 300] + name = "expe_with_auto_reco" gamma = 0.999 env = grid2op.make(env_name, @@ -113,8 +114,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): chronics_class=MultifolderWithCache) obs = env.reset() - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) - env.chronics_handler.real_data.set_filter(lambda x: True) + env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: True) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! @@ -134,6 +135,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): save_every_xxx_steps=min(nb_iter // 10, 100_000), verbose=1, gamma=0.999, + gymenv_class=GymEnvWithReco, ) print("After training, ") diff --git a/examples/ppo_stable_baselines/3_evaluate_trained_model.py b/examples/ppo_stable_baselines/3_evaluate_trained_model.py index b94662f..8bfd4ae 100644 --- a/examples/ppo_stable_baselines/3_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/3_evaluate_trained_model.py @@ -73,7 +73,7 @@ def get_ts_survived_dn(env_name): if __name__ == "__main__": - name = "expe_8" + name = "expe_0" # env_val = grid2op.make(env_name, backend=LightSimBackend()) @@ -90,6 +90,6 @@ def get_ts_survived_dn(env_name): dn_ts_survived = get_ts_survived_dn(env_name) best_than_dn = 0 for my_ts, dn_ts in zip(ts_survived, dn_ts_survived): - print(f"I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})") + print(f"\t{':-)' if my_ts >= dn_ts else ':-('} I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})") best_than_dn += my_ts >= dn_ts - print(f"The agent \"{name}\" beats do nothing in {best_than_dn} out of {len(dn_ts_survived)} episodes") \ No newline at end of file + print(f"The agent \"{name}\" beats do nothing in {best_than_dn} out of {len(dn_ts_survived)} episodes") diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index e14b503..69b3b80 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -53,6 +53,8 @@ def train(env, policy_kwargs=None, normalize_obs=False, normalize_act=False, + gymenv_class=GymEnv, + gymenv_kwargs=None, seed=None, # TODO eval_env=None, # TODO **kwargs): @@ -128,6 +130,12 @@ def train(env, normalize_act: ``bool`` Attempt to normalize the action space (so that gym-based stuff will only manipulate numbers between 0 and 1) + + gymenv_class: + The class to use as a gym environment. + + gymenv_kwargs: ``dict`` + Extra key words arguments to build the gym environment. 
policy_kwargs: ``dict`` extra parameters passed to the PPO "policy_kwargs" key word arguments @@ -198,7 +206,9 @@ def train(env, save_used_attribute(save_path, name, obs_attr_to_keep, act_attr_to_keep) # define the gym environment from the grid2op env - env_gym = GymEnv(env) + if gymenv_kwargs is None: + gymenv_kwargs = {} + env_gym = gymenv_class(env, **gymenv_kwargs) env_gym.observation_space.close() env_gym.observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py index 4f6ee00..6419b2d 100644 --- a/l2rpn_baselines/utils/__init__.py +++ b/l2rpn_baselines/utils/__init__.py @@ -19,7 +19,9 @@ "ReplayBuffer", "BaseDeepQ", "DeepQAgent", - "GymAgent" + "GymAgent", + "GymEnvWithReco", + "GymEnvWithHeuristics" ] from l2rpn_baselines.utils.cli_eval import cli_eval @@ -35,3 +37,4 @@ from l2rpn_baselines.utils.baseDeepQ import BaseDeepQ from l2rpn_baselines.utils.deepQAgent import DeepQAgent from l2rpn_baselines.utils.gymAgent import GymAgent +from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics, GymEnvWithReco diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py new file mode 100644 index 0000000..34c1db5 --- /dev/null +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
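+
+# Usage sketch: these wrappers are meant to be passed to l2rpn_baselines.PPO_SB3.train
+# through its new "gymenv_class" argument, for instance:
+#
+#     from l2rpn_baselines.utils import GymEnvWithReco
+#     from l2rpn_baselines.PPO_SB3 import train
+#     train(env, name="my_agent", iterations=10_000, gymenv_class=GymEnvWithReco)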
+ +from abc import abstractmethod +from typing import Tuple, Dict, List +import numpy as np + +from grid2op.Observation import BaseObservation +from grid2op.Action import BaseAction +from grid2op.gym_compat import GymEnv + +class GymEnvWithHeuristics(GymEnv): + @abstractmethod + def heuristic_actions(self, + g2op_obs: BaseObservation, + reward: float, + done: bool, + info: Dict) -> List[BaseAction]: + return g2op_obs, reward, done, info + + def apply_heuristics_actions(self, + g2op_obs: BaseObservation, + reward: float, + done: bool, + info: Dict ) -> Tuple[BaseObservation, float, bool, Dict]: + g2op_actions = self.heuristic_actions(g2op_obs, reward, done, info) + for g2op_act in g2op_actions: + tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act) + print(f"I did an automatic action: {g2op_act}") + g2op_obs = tmp_obs + done = tmp_done + if tmp_done: + break + return g2op_obs, reward, done, info + + def step(self, gym_action): + """[summary] + + Parameters + ---------- + gym_action : [type] + [description] + + Returns + ------- + [type] + [description] + """ + g2op_act = self.action_space.from_gym(gym_action) + g2op_obs, reward, done, info = self.init_env.step(g2op_act) + if not done: + g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) + gym_obs = self.observation_space.to_gym(g2op_obs) + return gym_obs, float(reward), done, info + + +class GymEnvWithReco(GymEnvWithHeuristics): + """[summary] + + Parameters + ---------- + GymEnv : [type] + [description] + """ + def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: + to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status) + res = [] + if np.any(to_reco): + reco_id = np.where(to_reco)[0] + for line_id in reco_id: + g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]}) + res.append(g2op_act) + return res From d47e1461b3ff86a0f0f3db44bd1fe20e413206eb Mon Sep 17 00:00:00 2001 From: BDonnot Date: Wed, 2 Feb 2022 21:49:43 +0100 Subject: [PATCH 27/56] launching another expe --- .../ppo_stable_baselines/2_train_agent.py | 28 +++++++++++-------- l2rpn_baselines/utils/gymenv_custom.py | 8 +++++- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index fcfcc3b..5ba6e43 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -29,9 +29,15 @@ def __init__(self): """ self.reward_min = 0. self.reward_max = 1. - self._max_redisp = None self._min_rho = 0.90 self._max_rho = 2.0 + + # parameters init with the environment + self._max_redisp = None + self._1_max_redisp = None + self._is_renew_ = None + self._1_max_redisp_act = None + self._nb_renew = None def initialize(self, env): self._max_redisp = np.maximum(env.gen_pmax - env.gen_pmin, 0.) @@ -55,17 +61,17 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): return self.reward_min # penalize the dispatch obs = env.get_obs() - # score_redisp_state = 0. - score_redisp_state = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) + score_redisp_state = 0. + # score_redisp_state = np.sum(np.abs(obs.target_dispatch) * self._1_max_redisp) score_redisp_action = np.sum(np.abs(action.redispatch) * self._1_max_redisp_act) score_redisp = 0.5 *(score_redisp_state + score_redisp_action) # penalize the curtailment - # score_curtail_state = 0. 
- score_curtail_state = np.sum(obs.curtailment_mw * self._1_max_redisp) + score_curtail_state = 0. + # score_curtail_state = np.sum(obs.curtailment_mw * self._1_max_redisp) curt_act = action.curtail score_curtail_action = np.sum(curt_act[curt_act != -1.0]) / self._nb_renew - score_curtail = 0.5 *(score_curtail_state + score_curtail_action) + score_curtail = 0.5 * (score_curtail_state + score_curtail_action) # rate the actions score_action = 0.5 * (np.sqrt(score_redisp) + np.sqrt(score_curtail)) @@ -79,8 +85,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # score close to goal # score_goal = 0. - score_goal = env.nb_time_step / env.max_episode_duration() - # score_goal = 1.0 + # score_goal = env.nb_time_step / env.max_episode_duration() + score_goal = 1.0 # score too much redisp res = score_goal * (1.0 - 0.5 * (score_action + score_state)) @@ -105,7 +111,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): nb_iter = 6_000_000 learning_rate = 3e-3 net_arch = [300, 300, 300] - name = "expe_with_auto_reco" + name = "expe_with_auto_reco_simplereward" gamma = 0.999 env = grid2op.make(env_name, @@ -114,8 +120,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): chronics_class=MultifolderWithCache) obs = env.reset() - env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) - # env.chronics_handler.real_data.set_filter(lambda x: True) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) + env.chronics_handler.real_data.set_filter(lambda x: True) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index 34c1db5..ecb2d56 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -15,6 +15,13 @@ from grid2op.gym_compat import GymEnv class GymEnvWithHeuristics(GymEnv): + """This abstract class is used to perform some actions, independantly of a RL + agent on a grid2op environment. + + It can be used, for example, to train an agent (for example a deep-rl agent) + if you want to use some heuristics at inference time (for example + you reconnect every powerline that you can.) 
+ """ @abstractmethod def heuristic_actions(self, g2op_obs: BaseObservation, @@ -31,7 +38,6 @@ def apply_heuristics_actions(self, g2op_actions = self.heuristic_actions(g2op_obs, reward, done, info) for g2op_act in g2op_actions: tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act) - print(f"I did an automatic action: {g2op_act}") g2op_obs = tmp_obs done = tmp_done if tmp_done: From 9ae6f0c2f53b0991550343aed94a220e153106d1 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Thu, 3 Feb 2022 11:53:48 +0100 Subject: [PATCH 28/56] rename the files in the examples --- .gitignore | 6 +- examples/ppo_stable_baselines/1_prep_env.py | 52 -------- examples/ppo_stable_baselines/A_prep_env.py | 124 ++++++++++++++++++ .../{2_train_agent.py => B_train_agent.py} | 13 +- ...d_model.py => C_evaluate_trained_model.py} | 77 +++++------ examples/ppo_stable_baselines/ReadMe.md | 17 +++ l2rpn_baselines/PPO_SB3/evaluate.py | 30 ++++- l2rpn_baselines/PPO_SB3/train.py | 2 +- l2rpn_baselines/PPO_SB3/utils.py | 7 +- l2rpn_baselines/utils/gymAgent.py | 55 +++++++- l2rpn_baselines/utils/gymenv_custom.py | 3 +- 11 files changed, 272 insertions(+), 114 deletions(-) delete mode 100644 examples/ppo_stable_baselines/1_prep_env.py create mode 100644 examples/ppo_stable_baselines/A_prep_env.py rename examples/ppo_stable_baselines/{2_train_agent.py => B_train_agent.py} (94%) rename examples/ppo_stable_baselines/{3_evaluate_trained_model.py => C_evaluate_trained_model.py} (53%) diff --git a/.gitignore b/.gitignore index 414a509..5e218f6 100644 --- a/.gitignore +++ b/.gitignore @@ -181,4 +181,8 @@ l2rpn_baselines/PPO_RLLIB/logs/** l2rpn_baselines/PPO_RLLIB/saved_model/** test_jsonpickle.json examples/ppo_stable_baselines/saved_model/** -examples/ppo_stable_baselines/logs/** \ No newline at end of file +examples/ppo_stable_baselines/logs/** +.vscode/ +logs/ +saved_agents/ +test_train_DuelQSimple.py diff --git a/examples/ppo_stable_baselines/1_prep_env.py b/examples/ppo_stable_baselines/1_prep_env.py deleted file mode 100644 index 9c2469c..0000000 --- a/examples/ppo_stable_baselines/1_prep_env.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020-2022, RTE (https://www.rte-france.com) -# See AUTHORS.txt -# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. -# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, -# you can obtain one at http://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. - -# this file needs to be run only once, it might take a while ! 
- -import grid2op -from grid2op.dtypes import dt_int -from grid2op.utils import ScoreICAPS2021 -from lightsim2grid import LightSimBackend -import numpy as np - - -env_name = "l2rpn_icaps_2021_small" -nb_process_stats = 8 -verbose = 1 - -# create the environment -env = grid2op.make(env_name) - -# split into train / val / test -# it is such that there are 25 chronics for val and 24 for test -env.seed(1) -env.reset() -nm_train, nm_test, nm_val = env.train_val_split_random(add_for_test="test", - pct_val=4.2, - pct_test=4.2) - -# computes some statistics for val / test to compare performance of -# some agents with the do nothing for example -max_int = max_int = np.iinfo(dt_int).max -for nm_ in [nm_val, nm_test]: - env_tmp = grid2op.make(nm_, backend=LightSimBackend()) - nb_scenario = len(env_tmp.chronics_handler.subpaths) - print(f"{nm_}: {nb_scenario}") - my_score = ScoreICAPS2021(env_tmp, - nb_scenario=nb_scenario, - env_seeds=np.random.randint(low=0, - high=max_int, - size=nb_scenario, - dtype=dt_int), - agent_seeds=[0 for _ in range(nb_scenario)], - verbose=verbose, - nb_process_stats=nb_process_stats, - ) - -# my_agent = DoNothingAgent(env.action_space) -# print(my_score.get(my_agent)) \ No newline at end of file diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py new file mode 100644 index 0000000..2d1c8ee --- /dev/null +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020-2022, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +# this file needs to be run only once, it might take a while ! 
+import os +import json +import numpy as np +import grid2op +from grid2op.dtypes import dt_int +from grid2op.Agent import RecoPowerlineAgent +from grid2op.utils import ScoreICAPS2021, EpisodeStatistics +from lightsim2grid import LightSimBackend +import numpy as np + + +env_name = "l2rpn_icaps_2021_small" +name_stats = "_reco_powerline" +nb_process_stats = 8 +verbose = 1 + +def _aux_get_env(env_name, dn=True, name_stat=None): + path_ = grid2op.get_current_local_dir() + path_env = os.path.join(path_, env_name) + if not os.path.exists(path_env): + raise RuntimeError(f"The environment \"{env_name}\" does not exist.") + + path_dn = os.path.join(path_env, "_statistics_icaps2021_dn") + if not os.path.exists(path_dn): + raise RuntimeError("The folder _statistics_icaps2021_dn used for computing the score do not exist") + path_reco = os.path.join(path_env, "_statistics_l2rpn_no_overflow_reco") + if not os.path.exists(path_reco): + raise RuntimeError("The folder _statistics_l2rpn_no_overflow_reco used for computing the score do not exist") + + if name_stat is None: + if dn: + path_metadata = os.path.join(path_dn, "metadata.json") + else: + path_metadata = os.path.join(path_reco, "metadata.json") + else: + path_stat = os.path.join(path_env, EpisodeStatistics.get_name_dir(name_stat)) + if not os.path.exists(path_stat): + raise RuntimeError(f"No folder associated with statistics {name_stat}") + import pdb + pdb.set_trace() + path_metadata = os.path.join(path_stat, "metadata.json") + + if not os.path.exists(path_metadata): + raise RuntimeError("No metadata can be found for the statistics you wanted to compute.") + + with open(path_metadata, "r", encoding="utf-8") as f: + dict_ = json.load(f) + + return dict_ + + +def get_env_seed(env_name: str): + """This function ensures that you can reproduce the results of the computed scenarios. + + It forces the seeds of the environment, during evaluation to be the same as the one used during the evaluation of the score. + + As environments are stochastic in grid2op, it is very important that you use this function (or a similar one) before + computing the scores of your agent. + + Args: + env_name (str): The environment name on which you want to retrieve the seeds used + + Raises: + RuntimeError: When it is not possible to retrieve the seeds (for example when the "statistics" has not been computed) + + Returns: + [type]: [description] + """ + + dict_ = _aux_get_env(env_name) + + key = "env_seeds" + if key not in dict_: + raise RuntimeError(f"Impossible to find the key {key} in the dictionnary. 
You should re run the score function.") + + return dict_[key] + + +if __name__ == "__main__": + # create the environment + env = grid2op.make(env_name) + + # split into train / val / test + # it is such that there are 25 chronics for val and 24 for test + env.seed(1) + env.reset() + nm_train, nm_val, nm_test = env.train_val_split_random(add_for_test="test", + pct_val=4.2, + pct_test=4.2) + + # computes some statistics for val / test to compare performance of + # some agents with the do nothing for example + max_int = max_int = np.iinfo(dt_int).max + for nm_ in [nm_val, nm_test]: + env_tmp = grid2op.make(nm_, backend=LightSimBackend()) + nb_scenario = len(env_tmp.chronics_handler.subpaths) + print(f"{nm_}: {nb_scenario}") + my_score = ScoreICAPS2021(env_tmp, + nb_scenario=nb_scenario, + env_seeds=np.random.randint(low=0, + high=max_int, + size=nb_scenario, + dtype=dt_int), + agent_seeds=[0 for _ in range(nb_scenario)], + verbose=verbose, + nb_process_stats=nb_process_stats, + ) + + # compute statistics for reco powerline + seeds = get_env_seed(nm_) + reco_powerline_agent = RecoPowerlineAgent(env_tmp.action_space) + stats_reco = EpisodeStatistics(env_tmp, name_stats=name_stats) + stats_reco.compute(nb_scenario=nb_scenario, + agent=reco_powerline_agent, + env_seeds=seeds) diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py similarity index 94% rename from examples/ppo_stable_baselines/2_train_agent.py rename to examples/ppo_stable_baselines/B_train_agent.py index 5ba6e43..6a8a6fb 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -16,9 +16,11 @@ import re import numpy as np from grid2op.Reward import BaseReward +from l2rpn_baselines.utils import GymEnvWithReco env_name = "l2rpn_icaps_2021_small_train" save_path = "./saved_model" +gymenv_class = GymEnvWithReco # customize the reward function (optional) class CustomReward(BaseReward): @@ -99,7 +101,6 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): from l2rpn_baselines.PPO_SB3 import train from lightsim2grid import LightSimBackend # highly recommended ! from grid2op.Chronics import MultifolderWithCache # highly recommended for training - from l2rpn_baselines.utils import GymEnvWithReco obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "gen_p", "load_p", "p_or", @@ -108,10 +109,10 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): "curtailment", "gen_p_before_curtail"] act_attr_to_keep = ["redispatch", "curtail"] - nb_iter = 6_000_000 + nb_iter = 6_000 learning_rate = 3e-3 net_arch = [300, 300, 300] - name = "expe_with_auto_reco_simplereward" + name = "expe_test" gamma = 0.999 env = grid2op.make(env_name, @@ -120,8 +121,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): chronics_class=MultifolderWithCache) obs = env.reset() - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) - env.chronics_handler.real_data.set_filter(lambda x: True) + env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: True) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! 
@@ -141,7 +142,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): save_every_xxx_steps=min(nb_iter // 10, 100_000), verbose=1, gamma=0.999, - gymenv_class=GymEnvWithReco, + gymenv_class=gymenv_class, ) print("After training, ") diff --git a/examples/ppo_stable_baselines/3_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py similarity index 53% rename from examples/ppo_stable_baselines/3_evaluate_trained_model.py rename to examples/ppo_stable_baselines/C_evaluate_trained_model.py index 8bfd4ae..7f68aa2 100644 --- a/examples/ppo_stable_baselines/3_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -6,60 +6,30 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -import os -import json import grid2op -import numpy as np from grid2op.utils import ScoreICAPS2021 from lightsim2grid import LightSimBackend +from grid2op.gym_compat import GymEnv +from grid2op.Agent import RecoPowerlineAgent + from l2rpn_baselines.PPO_SB3 import evaluate +from A_prep_env import _aux_get_env, get_env_seed, name_stats +from B_train_agent import gymenv_class env_name = "l2rpn_icaps_2021_small_val" +agent_name = "expe_test" nb_scenario = 25 nb_process_stats = 1 load_path = "./saved_model" -def _aux_get_env(env_name, dn=True): - path_ = grid2op.get_current_local_dir() - path_env = os.path.join(path_, env_name) - if not os.path.exists(path_env): - raise RuntimeError(f"The environment \"{env_name}\" does not exist.") - - path_dn = os.path.join(path_env, "_statistics_icaps2021_dn") - if not os.path.exists(path_dn): - raise RuntimeError("The folder _statistics_icaps2021_dn used for computing the score do not exist") - path_reco = os.path.join(path_env, "_statistics_l2rpn_no_overflow_reco") - if not os.path.exists(path_reco): - raise RuntimeError("The folder _statistics_l2rpn_no_overflow_reco used for computing the score do not exist") - if dn: - path_metadata = os.path.join(path_dn, "metadata.json") - else: - path_metadata = os.path.join(path_reco, "metadata.json") - - if not os.path.exists(path_metadata): - raise RuntimeError("The folder _statistics_icaps2021_dn does not appear to be a score folder") - - with open(path_metadata, "r", encoding="utf-8") as f: - dict_ = json.load(f) - - return dict_ - -def get_env_seed(env_name): - dict_ = _aux_get_env(env_name) - - key = "env_seeds" - if key not in dict_: - raise RuntimeError(f"Impossible to find the key {key} in the dictionnary. 
You should re run the score function.") - - return dict_[key] - -def load_agent(env, load_path, name): +def load_agent(env, load_path, name, gymenv_class=GymEnv): trained_agent, _ = evaluate(env, nb_episode=0, load_path=load_path, - name=name) + name=name, + gymenv_class=gymenv_class) return trained_agent @@ -71,25 +41,44 @@ def get_ts_survived_dn(env_name): res.append(tmp_) return res +def get_ts_survived_reco(env_name): + dict_ = _aux_get_env(env_name, name_stat=name_stats) + res = [] + for kk in range(nb_scenario): + tmp_ = dict_[f"{kk}"]["nb_step"] + res.append(tmp_) + return res + if __name__ == "__main__": - name = "expe_0" # env_val = grid2op.make(env_name, backend=LightSimBackend()) + dn_ts_survived = get_ts_survived_dn(env_name) + reco_ts_survived = get_ts_survived_reco(env_name) + my_score = ScoreICAPS2021(env_val, nb_scenario=nb_scenario, - env_seeds=get_env_seed(env_name), + env_seeds=get_env_seed(env_name)[:nb_scenario], agent_seeds=[0 for _ in range(nb_scenario)], verbose=False, nb_process_stats=nb_process_stats, ) - my_agent = load_agent(env_val, load_path=load_path, name=name) + my_agent = load_agent(env_val, load_path=load_path, name=agent_name, gymenv_class=gymenv_class) + my_agent = RecoPowerlineAgent(env_val.action_space) _, ts_survived, _ = my_score.get(my_agent) - dn_ts_survived = get_ts_survived_dn(env_name) + + # compare with do nothing best_than_dn = 0 for my_ts, dn_ts in zip(ts_survived, dn_ts_survived): print(f"\t{':-)' if my_ts >= dn_ts else ':-('} I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})") best_than_dn += my_ts >= dn_ts - print(f"The agent \"{name}\" beats do nothing in {best_than_dn} out of {len(dn_ts_survived)} episodes") + print(f"The agent \"{agent_name}\" beats \"do nothing\" baseline in {best_than_dn} out of {len(dn_ts_survived)} episodes") + + # compare with reco powerline + best_than_reco = 0 + for my_ts, reco_ts in zip(ts_survived, dn_ts_survived): + print(f"\t{':-)' if my_ts >= reco_ts else ':-('} I survived {my_ts} steps vs {reco_ts} for reco powerline ({my_ts - reco_ts})") + best_than_reco += my_ts >= reco_ts + print(f"The agent \"{agent_name}\" beats \"reco powerline\" baseline in {best_than_reco} out of {len(dn_ts_survived)} episodes") diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md index 53e3be1..2745987 100644 --- a/examples/ppo_stable_baselines/ReadMe.md +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -17,8 +17,25 @@ It is organized as follow: ## 1 Preparing the training environment +This is done by running the script `A_prep_env.py` + +In this phase, we do 3 things: + +- we split the data set into a training, validation and test set. This is quite standard in ML (less in RL) and its main goal is to prevent overfitting. (we remind the scenarios on codalab will be different from the training set provided, though drawn from the same distribution) +- we initialize the computation of the scores. In the case of l2rpn competitions, the score is cannot be easily made into a reward function, it can only be computed when knowing the entire episode, at the end of the episode\*. +- we compute the score of a few "standard" baselines to compared the trained agent with + +\* of course you can make a sparse reward from it. Your agent receive always 0.0 unless when "done = True" (so last step of the episode) where this score can be computed. This is not the approach we took here. 
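A minimal sketch of such a sparse reward is shown below; it reuses the `BaseReward` interface already used by `CustomReward` in `B_train_agent.py`. The class name is illustrative (it is not part of this repository) and the end-of-episode signal is only the fraction of the scenario survived, a crude stand-in for the real l2rpn score.

```python
from grid2op.Reward import BaseReward

class SparseEndOfEpisodeReward(BaseReward):
    """Illustrative only: 0. at every step, a single signal when the episode ends."""
    def __init__(self):
        BaseReward.__init__(self)
        self.reward_min = 0.
        self.reward_max = 1.

    def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous):
        if not is_done:
            return self.reward_min  # no feedback during the episode
        # crude stand-in for the score: fraction of the scenario that was completed
        return env.nb_time_step / env.max_episode_duration()
```

Such a reward would be passed through the `reward_class` argument of `grid2op.make`, exactly like `CustomReward` in `B_train_agent.py`.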
+ ## 2 Training the agent +In this phase TODO + + ## 3 evaluate the agent +TODO + ## 4 preparing the submision + +TODO \ No newline at end of file diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index 10038f2..f60b776 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -9,11 +9,10 @@ import os import json from grid2op.Runner import Runner +from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace, GymEnv +from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics from l2rpn_baselines.utils.save_log_gif import save_log_gif - -from grid2op.gym_compat import BoxGymActSpace, BoxGymObsSpace - from l2rpn_baselines.PPO_SB3.utils import SB3Agent @@ -26,6 +25,8 @@ def evaluate(env, max_steps=-1, verbose=False, save_gif=False, + gymenv_class=GymEnv, + gymenv_kwargs=None, **kwargs): """ This function will use stable baselines 3 to evaluate a previously trained @@ -73,6 +74,12 @@ def evaluate(env, Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might take a lot of ram) and drastically increase computation time. + gymenv_class: + The class to use as a gym environment. By default `GymEnv` (from module grid2op.gym_compat) + + gymenv_kwargs: ``dict`` + Extra key words arguments to build the gym environment. + kwargs: extra parameters passed to the PPO from stable baselines 3 @@ -159,13 +166,26 @@ def evaluate(env, if os.path.exists(os.path.join(load_path, ".normalize_obs")): for attr_nm in obs_attr_to_keep: gym_observation_space.normalize_attr(attr_nm) - + + gymenv = None + if gymenv_class is not None and issubclass(gymenv_class, GymEnvWithHeuristics): + if gymenv_kwargs is None: + gymenv_kwargs = {} + gymenv = gymenv_class(env, **gymenv_kwargs) + + gymenv.action_space.close() + gymenv.action_space = gym_action_space + + gymenv.observation_space.close() + gymenv.observation_space = gym_observation_space + # create a grid2gop agent based on that (this will reload the save weights) full_path = os.path.join(load_path, name) grid2op_agent = SB3Agent(env.action_space, gym_action_space, gym_observation_space, - nn_path=os.path.join(full_path, name) + nn_path=os.path.join(full_path, name), + gymenv=gymenv, ) if nb_episode == 0: diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index 69b3b80..ecad643 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -132,7 +132,7 @@ def train(env, manipulate numbers between 0 and 1) gymenv_class: - The class to use as a gym environment. + The class to use as a gym environment. By default `GymEnv` (from module grid2op.gym_compat) gymenv_kwargs: ``dict`` Extra key words arguments to build the gym environment. diff --git a/l2rpn_baselines/PPO_SB3/utils.py b/l2rpn_baselines/PPO_SB3/utils.py index 0a1284c..82081df 100644 --- a/l2rpn_baselines/PPO_SB3/utils.py +++ b/l2rpn_baselines/PPO_SB3/utils.py @@ -132,6 +132,8 @@ class SB3Agent(GymAgent): Exactly one of `nn_path` and `nn_kwargs` should be provided. No more, no less. + TODO heuristic part ! 
+ Examples --------- @@ -209,6 +211,7 @@ def __init__(self, nn_path=None, nn_kwargs=None, custom_load_dict=None, + gymenv=None ): self._nn_type = nn_type if custom_load_dict is not None: @@ -216,7 +219,9 @@ def __init__(self, else: self.custom_load_dict = {} super().__init__(g2op_action_space, gym_act_space, gym_obs_space, - nn_path=nn_path, nn_kwargs=nn_kwargs) + nn_path=nn_path, nn_kwargs=nn_kwargs, + gymenv=gymenv + ) def get_act(self, gym_obs, reward, done): """Retrieve the gym action from the gym observation and the reward. diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index fb4e01a..9d700e8 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -8,11 +8,14 @@ from abc import abstractmethod import copy +from typing import List, Optional from grid2op.Agent import BaseAgent from grid2op.Observation import BaseObservation from grid2op.Action import BaseAction +from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics + class GymAgent(BaseAgent): """ @@ -25,6 +28,8 @@ class GymAgent(BaseAgent): Use it only with a trained agent. It does not provide the "save" method and is not suitable for training. + TODO heuristic part ! + ..info:: To load a previously saved agent the function `GymAgent.load` will be called and you must provide the `nn_path` keyword argument. @@ -33,6 +38,9 @@ class GymAgent(BaseAgent): you must provide the `nn_kwargs` keyword argument. You cannot set both, you have to set one. + + TODO example !!! + """ def __init__(self, g2op_action_space, @@ -41,11 +49,21 @@ def __init__(self, *, # to prevent positional argument nn_path=None, nn_kwargs=None, + gymenv=None, _check_both_set=True, _check_none_set=True): super().__init__(g2op_action_space) self._gym_act_space = gym_act_space self._gym_obs_space = gym_obs_space + + self._has_heuristic : bool = False + self.gymenv : Optional[GymEnvWithHeuristics] = gymenv + self._action_list : Optional[List] = None + + if self.gymenv is not None and isinstance(self.gymenv, GymEnvWithHeuristics): + self._has_heuristic = True + self._action_list = [] + if _check_none_set and (nn_path is None and nn_kwargs is None): raise RuntimeError("Impossible to build a GymAgent without providing at " "least one of `nn_path` (to load the agent from disk) " @@ -95,7 +113,21 @@ def build(self): ..info:: Only called if the agent has been build with `nn_path=None` and `nn_kwargs` not None """ pass + + def clean_heuristic_actions(self, observation: BaseObservation, reward: float, done: bool) -> None: + """This function allows to cure the heuristic actions. + It is called at each step, just after the heuristic actions are computed (but before they are selected). + + It can be used, for example, to reorder the `self._action_list` for example. + + Args: + observation (BaseObservation): The current observation + reward (float): the current reward + done (bool): the current flag "done" + """ + pass + def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAction: """This function is called to "map" the grid2op world into a usable format by a neural networks (for example in a format @@ -122,7 +154,24 @@ def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAc In this case the "gym agent" will only be used in particular settings. 
""" - gym_obs = self._gym_obs_space.to_gym(observation) - gym_act = self.get_act(gym_obs, reward, done) - grid2op_act = self._gym_act_space.from_gym(gym_act) + grid2op_act = None + + # heuristic part + if self._has_heuristic: + if not self._action_list: + # the list of actions is empty, i querry the heuristic to see if there's something I can do + self._action_list = self.gymenv.heuristic_actions(observation, reward, done, {}) + + self.clean_heuristic_actions(observation, reward, done) + if self._action_list: + # some heuristic actions have been selected, i select the first one + grid2op_act = self._action_list.pop(0) + + # the heursitic did not select any actions, then ask the NN to do one ! + if grid2op_act is None: + # gym_obs = self._gym_obs_space.to_gym(observation) + # gym_act = self.get_act(gym_obs, reward, done) + # grid2op_act = self._gym_act_space.from_gym(gym_act) + grid2op_act = self.action_space() + return grid2op_act diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index ecb2d56..41124be 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -14,6 +14,7 @@ from grid2op.Action import BaseAction from grid2op.gym_compat import GymEnv + class GymEnvWithHeuristics(GymEnv): """This abstract class is used to perform some actions, independantly of a RL agent on a grid2op environment. @@ -28,7 +29,7 @@ def heuristic_actions(self, reward: float, done: bool, info: Dict) -> List[BaseAction]: - return g2op_obs, reward, done, info + return [] def apply_heuristics_actions(self, g2op_obs: BaseObservation, From c0eba0e9155592e5557a8a0b570154d4d4dd45dc Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Thu, 3 Feb 2022 15:56:36 +0100 Subject: [PATCH 29/56] updating documentation --- docs/external_contributions.rst | 208 +++++++++++++++++++++++++++++++- setup.py | 2 +- 2 files changed, 205 insertions(+), 5 deletions(-) diff --git a/docs/external_contributions.rst b/docs/external_contributions.rst index dc2aa27..2e3d2ff 100644 --- a/docs/external_contributions.rst +++ b/docs/external_contributions.rst @@ -8,25 +8,225 @@ Description In this section you can find some examples made by other persons that are included into l2rpn-baselines if you download it with github. -You can find more information in the associated github. +You can find more information in the associated github repositories. + +======================== ================================== +Baseline note +======================== ================================== +:ref:`asynch` 🥈 @ WCCI 2019 +:ref:`binbinchen` 🥈 @ Neurips 2020 (robustness) +:ref:`geirina` 🥇 @ WCCI 2019 +:ref:`indigosix` 4️⃣ @ ICAPS 2021 +:ref:`kaist` 🥇 @ WCCI 2020 +:ref:`lujixiang` 🥉 @ Neurips 2020 (robustness) +:ref:`lujixiang` 🥉 @ Neurips 2020 (adaptability) +:ref:`magicpowergrids` Neurips 2020 (robustness) +:ref:`mazerl` 🥉 @ ICAPS 2021 +:ref:`ppopf` General optimization model +:ref:`rccccc` Neurips 2020 (robustness) +:ref:`rlagnet` 🥇 @ Neurips 2020 (robustness) +:ref:`rlagnet` 🥇 @ Neurips 2020 (adaptability) +:ref:`supremaciachina` 🥈 @ ICAPS 2021 +:ref:`taka` Neurips 2020 (robustness) +:ref:`taka` Neurips 2020 (adaptability) +:ref:`tonys` Neurips 2020 (adaptability) +:ref:`unaigridoperator` Neurips 2020 (robustness) +:ref:`unaigridoperator` Neurips 2020 (adaptability) +:ref:`xdsilly` 🥇 @ ICAPS 2021 +:ref:`yzmneurips` Neurips 2020 (robustness) +:ref:`yzmwcci` 🥉 @ WCCI 2020 +======================== ================================== + +.. 
note:: + They are sorted alphabatically. + +.. warning:: + We do not maintain any of these repositories. If you have trouble to make them work + please contact directly their authors. + +.. _asynch: AsynchronousActorCritic ------------------------- github: https://github.com/KishanGitASU/A3C-RL-baseline-agent-for-Grid2Op-environment.git +.. note:: + 2nd at the first L2RPN competion in 2019 (team "*Learning_RL*") + +.. _binbinchen: + +Binbinchen +------------ + +github: https://github.com/AsprinChina/L2RPN_NIPS_2020_a_PPO_Solution + +.. note:: + 2nd at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*binbinchen*") + +.. _geirina: + Geirina ----------- guthub: https://github.com/djmax008/GEIRINA_baseline +.. note:: + 1st at the first L2RPN competion in 2019 (team "*Geirina*") + +.. _indigosix: + +IndigoSix +---------- + +github: https://github.com/qsrmlhj/L2RPN-2021 + +.. note:: + 4th at the fourth L2RPN competion in 2021 for ICAPS conference (team "*IndigoSix*") + +.. _kaist: + Kaist ------- github: https://github.com/sunghoonhong/L2RPN-WCCI-2020-Winner +.. note:: + 1st at the second L2RPN competion in 2020 for WCCI conference (team "*shhong*") + +.. _lujixiang: + +Lujixiang (2 submissions) +-------------------------- + +github: https://github.com/lujasone/NeurIPS_2020_L2RPN_Comp_An_Approach + +paper: https://github.com/lujasone/NeurIPS_2020_L2RPN_Comp_An_Approach/blob/main/NeurIPS-competition_winnning_apprach.pdf + +.. note:: + 3rd at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*lujixiang*") + + 3rd at the third L2RPN competion in 2020 for Neurips conference, track adaptability (team "*lujixiang*") + +.. _magicpowergrids: + +MagicPowerGrids +---------------- + +github: https://github.com/rl-developer/l2rpn-expert_experience_strategy + +.. note:: + Participated in the third L2RPN competion in 2020 for Neurips conference, track robustness + +.. _mazerl: + +Maze-rl +-------- + +github: https://github.com/enlite-ai/maze-l2rpn-2021-submission + +.. note:: + 3rd at the fourth L2RPN competion in 2021 for ICAPS conference (team "*Maze-rl*") + +.. _ppopf: + PandapowerOPFAgent -------------------- github: https://github.com/jhmenke/grid2op_pp_baseline +.. note:: + This beaseline does not use any RL at all and is purely based on step by step optimization. -.. warning:: - We do not maintain any of these repository. If you have trouble to make them work - please contact directly their authors. \ No newline at end of file +.. _rccccc: + +Rccccc. +--------- + +github: https://github.com/eexuyan/A-Rainbow-based-agent-for-L2RPN + +.. note:: + Participated in the third L2RPN competion in 2020 for Neurips conference, track robustness + +.. _rlagnet: + +RL\_Agnet (2 submissions) +------------------------- +github: https://github.com/PaddlePaddle/PARL/tree/develop/benchmark/torch/NeurIPS2020-Learning-to-Run-a-Power-Network-Challenge + +paper: https://arxiv.org/pdf/2106.15200.pdf + +.. note:: + 1st at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*rl_agnet*") + + 1st at the third L2RPN competion in 2020 for Neurips conference, track adaptability (team "*rl_agnet*") + +.. _supremaciachina: + +SupremaciaChina +---------------- + +github: https://github.com/horacioMartinez/L2RPN + +.. note:: + 2nd at the fourth L2RPN competion in 2021 for ICAPS conference (team "*SupremaciaChina*") + +.. _taka: + +Taka +----- + +github: https://gitlab.com/motoki.saitama/l2rpn + +.. 
note:: + Participated in the third L2RPN competion in 2020 for Neurips conference, track robustness + + Participated in the third L2RPN competion in 2020 for Neurips conference, track adaptability + +.. _tonys: + +TonyS +-------- + +github: https://github.com/EricLDS/l2rpn-public-submission + +.. note:: + Participated in the third L2RPN competion in 2020 for Neurips conference, track adaptability + +.. _unaigridoperator: + +UN-aiGridOperator (2 submissions) +----------------------------------- + +github: https://github.com/UnAIOperator/L2RPN-NEURIPS-2020 + +.. note:: + 7th at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*UN-aiGridOperator*") + + 5th at the third L2RPN competion in 2020 for Neurips conference, track adaptability (team "*UN-aiGridOperator*") + +.. _xdsilly: + +Xd_Silly +---------- + +github: https://github.com/polixir/L2RPN_2021 + +.. note:: + 1st at the fourth L2RPN competion in 2021 for ICAPS conference (team "*Xd_Silly*") + +.. _yzmneurips: + +YZM_Test - Neurips 2020 +------------------------ + +github: https://github.com/ZM-Learn/L2RPN_WCCI_a_Solution + +.. note:: + 9th at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*yzm_test*") + +.. _yzmwcci: + +YZM_Test - WCCI 2020 +---------------------- + +github: https://github.com/ZM-Learn/L2RPN_WCCI_a_Solution + +.. note:: + 3rd at the second L2RPN competion in 2020 for WCCI conference (team "*yzm_test*") diff --git a/setup.py b/setup.py index 2284255..254fd5d 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "sphinx>=2.4.4", "sphinx-rtd-theme>=0.4.3", "sphinxcontrib-trio>=1.1.0", - "autodocsumm>=0.1.13" + "autodocsumm>=0.2.7" ], "optional": ["grid2op[optional]>=1.6.5", "tensorflow>=2.2.0", From 85f200f7a8ebb12902795a276136bf8ef4d57901 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Thu, 3 Feb 2022 18:27:35 +0100 Subject: [PATCH 30/56] add some contributions in the docs, improving the heuristic part --- docs/external_contributions.rst | 32 +++++++------- examples/ppo_stable_baselines/A_prep_env.py | 2 - .../ppo_stable_baselines/B_train_agent.py | 7 +--- .../C_evaluate_trained_model.py | 17 +++++--- l2rpn_baselines/utils/__init__.py | 5 ++- l2rpn_baselines/utils/gymAgent.py | 7 ++-- l2rpn_baselines/utils/gymenv_custom.py | 42 ++++++++++++++++--- 7 files changed, 71 insertions(+), 41 deletions(-) diff --git a/docs/external_contributions.rst b/docs/external_contributions.rst index 2e3d2ff..b650001 100644 --- a/docs/external_contributions.rst +++ b/docs/external_contributions.rst @@ -8,10 +8,16 @@ Description In this section you can find some examples made by other persons that are included into l2rpn-baselines if you download it with github. -You can find more information in the associated github repositories. +.. note:: + If you want your code to be part of this list, please let us know with a github issue, + for example by filling an issue template at the official l2rpn repository: https://github.com/rte-france/l2rpn-baselines/issues . + (this is also the link if you notice any error on this page) + +You can find more information in the associated github repositories (links available in the subsection dedicated to the +baseline). 
======================== ================================== -Baseline note +Baseline Notes ======================== ================================== :ref:`asynch` 🥈 @ WCCI 2019 :ref:`binbinchen` 🥈 @ Neurips 2020 (robustness) @@ -33,8 +39,8 @@ Baseline note :ref:`unaigridoperator` Neurips 2020 (robustness) :ref:`unaigridoperator` Neurips 2020 (adaptability) :ref:`xdsilly` 🥇 @ ICAPS 2021 -:ref:`yzmneurips` Neurips 2020 (robustness) -:ref:`yzmwcci` 🥉 @ WCCI 2020 +:ref:`yzm` Neurips 2020 (robustness) +:ref:`yzm` 🥉 @ WCCI 2020 ======================== ================================== .. note:: @@ -211,22 +217,14 @@ github: https://github.com/polixir/L2RPN_2021 .. note:: 1st at the fourth L2RPN competion in 2021 for ICAPS conference (team "*Xd_Silly*") -.. _yzmneurips: - -YZM_Test - Neurips 2020 ------------------------- - -github: https://github.com/ZM-Learn/L2RPN_WCCI_a_Solution - -.. note:: - 9th at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*yzm_test*") - -.. _yzmwcci: +.. _yzmn: -YZM_Test - WCCI 2020 ----------------------- +YZM_Test (2 submissions) +------------------------- github: https://github.com/ZM-Learn/L2RPN_WCCI_a_Solution .. note:: 3rd at the second L2RPN competion in 2020 for WCCI conference (team "*yzm_test*") + + 9th at the third L2RPN competion in 2020 for Neurips conference, track robustness (team "*yzm_test*") diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index 2d1c8ee..4b85220 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -45,8 +45,6 @@ def _aux_get_env(env_name, dn=True, name_stat=None): path_stat = os.path.join(path_env, EpisodeStatistics.get_name_dir(name_stat)) if not os.path.exists(path_stat): raise RuntimeError(f"No folder associated with statistics {name_stat}") - import pdb - pdb.set_trace() path_metadata = os.path.join(path_stat, "metadata.json") if not os.path.exists(path_metadata): diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 6a8a6fb..5e96606 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -16,11 +16,11 @@ import re import numpy as np from grid2op.Reward import BaseReward -from l2rpn_baselines.utils import GymEnvWithReco +from l2rpn_baselines.utils import GymEnvWithReco, GymEnvWithRecoWithDN env_name = "l2rpn_icaps_2021_small_train" save_path = "./saved_model" -gymenv_class = GymEnvWithReco +gymenv_class = GymEnvWithRecoWithDN # customize the reward function (optional) class CustomReward(BaseReward): @@ -144,6 +144,3 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): gamma=0.999, gymenv_class=gymenv_class, ) - - print("After training, ") - # TODO evaluate it ! \ No newline at end of file diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index 7f68aa2..9eae388 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -6,6 +6,8 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+import numpy as np + import grid2op from grid2op.utils import ScoreICAPS2021 from lightsim2grid import LightSimBackend @@ -39,6 +41,8 @@ def get_ts_survived_dn(env_name): for kk in range(nb_scenario): tmp_ = dict_[f"{kk}"]["nb_step"] res.append(tmp_) + res = np.array(res) + res -= 1 # the first observation (after reset) is counted as a step in the runner return res def get_ts_survived_reco(env_name): @@ -47,16 +51,20 @@ def get_ts_survived_reco(env_name): for kk in range(nb_scenario): tmp_ = dict_[f"{kk}"]["nb_step"] res.append(tmp_) + res = np.array(res) + res -= 1 # the first observation (after reset) is counted as a step in the runner return res if __name__ == "__main__": - # + # create the environment env_val = grid2op.make(env_name, backend=LightSimBackend()) + + # retrieve the reference data dn_ts_survived = get_ts_survived_dn(env_name) reco_ts_survived = get_ts_survived_reco(env_name) - + my_score = ScoreICAPS2021(env_val, nb_scenario=nb_scenario, env_seeds=get_env_seed(env_name)[:nb_scenario], @@ -66,7 +74,6 @@ def get_ts_survived_reco(env_name): ) my_agent = load_agent(env_val, load_path=load_path, name=agent_name, gymenv_class=gymenv_class) - my_agent = RecoPowerlineAgent(env_val.action_space) _, ts_survived, _ = my_score.get(my_agent) # compare with do nothing @@ -78,7 +85,7 @@ def get_ts_survived_reco(env_name): # compare with reco powerline best_than_reco = 0 - for my_ts, reco_ts in zip(ts_survived, dn_ts_survived): + for my_ts, reco_ts in zip(ts_survived, reco_ts_survived): print(f"\t{':-)' if my_ts >= reco_ts else ':-('} I survived {my_ts} steps vs {reco_ts} for reco powerline ({my_ts - reco_ts})") best_than_reco += my_ts >= reco_ts - print(f"The agent \"{agent_name}\" beats \"reco powerline\" baseline in {best_than_reco} out of {len(dn_ts_survived)} episodes") + print(f"The agent \"{agent_name}\" beats \"reco powerline\" baseline in {best_than_reco} out of {len(reco_ts_survived)} episodes") diff --git a/l2rpn_baselines/utils/__init__.py b/l2rpn_baselines/utils/__init__.py index 6419b2d..97ef8a0 100644 --- a/l2rpn_baselines/utils/__init__.py +++ b/l2rpn_baselines/utils/__init__.py @@ -20,8 +20,9 @@ "BaseDeepQ", "DeepQAgent", "GymAgent", + "GymEnvWithHeuristics", "GymEnvWithReco", - "GymEnvWithHeuristics" + "GymEnvWithRecoWithDN", ] from l2rpn_baselines.utils.cli_eval import cli_eval @@ -37,4 +38,4 @@ from l2rpn_baselines.utils.baseDeepQ import BaseDeepQ from l2rpn_baselines.utils.deepQAgent import DeepQAgent from l2rpn_baselines.utils.gymAgent import GymAgent -from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics, GymEnvWithReco +from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics, GymEnvWithReco, GymEnvWithRecoWithDN diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 9d700e8..8037367 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -169,9 +169,8 @@ def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAc # the heursitic did not select any actions, then ask the NN to do one ! 
if grid2op_act is None: - # gym_obs = self._gym_obs_space.to_gym(observation) - # gym_act = self.get_act(gym_obs, reward, done) - # grid2op_act = self._gym_act_space.from_gym(gym_act) - grid2op_act = self.action_space() + gym_obs = self._gym_obs_space.to_gym(observation) + gym_act = self.get_act(gym_obs, reward, done) + grid2op_act = self._gym_act_space.from_gym(gym_act) return grid2op_act diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index 41124be..de21a25 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -36,12 +36,18 @@ def apply_heuristics_actions(self, reward: float, done: bool, info: Dict ) -> Tuple[BaseObservation, float, bool, Dict]: - g2op_actions = self.heuristic_actions(g2op_obs, reward, done, info) - for g2op_act in g2op_actions: - tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act) - g2op_obs = tmp_obs - done = tmp_done - if tmp_done: + need_action = True + while need_action: + need_action = False + g2op_actions = self.heuristic_actions(g2op_obs, reward, done, info) + for g2op_act in g2op_actions: + need_action = True + tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act) + g2op_obs = tmp_obs + done = tmp_done + if tmp_done: + break + if done: break return g2op_obs, reward, done, info @@ -83,3 +89,27 @@ def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]}) res.append(g2op_act) return res + + +class GymEnvWithRecoWithDN(GymEnvWithHeuristics): + """[summary] + + Parameters + ---------- + GymEnv : [type] + [description] + """ + def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: + to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status) + res = [] + if np.any(to_reco): + # reconnect something if it can be + reco_id = np.where(to_reco)[0] + for line_id in reco_id: + g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]}) + res.append(g2op_act) + elif g2op_obs.rho.max() <= 0.9: + # play do nothing if there is no problem + res = [self.init_env.action_space()] + + return res From 9431b99862c00e61b5598a7eb42ee5a2c93cd0c1 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Thu, 3 Feb 2022 20:06:32 +0100 Subject: [PATCH 31/56] expe test --- examples/ppo_stable_baselines/2_train_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ppo_stable_baselines/2_train_agent.py b/examples/ppo_stable_baselines/2_train_agent.py index 5ba6e43..0a071da 100644 --- a/examples/ppo_stable_baselines/2_train_agent.py +++ b/examples/ppo_stable_baselines/2_train_agent.py @@ -84,9 +84,9 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): score_state = 0. # score close to goal - # score_goal = 0. + score_goal = 0. 
# score_goal = env.nb_time_step / env.max_episode_duration() - score_goal = 1.0 + # score_goal = 1.0 # score too much redisp res = score_goal * (1.0 - 0.5 * (score_action + score_state)) @@ -111,7 +111,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): nb_iter = 6_000_000 learning_rate = 3e-3 net_arch = [300, 300, 300] - name = "expe_with_auto_reco_simplereward" + name = "expe_with_auto_reco_onlyend_ep" gamma = 0.999 env = grid2op.make(env_name, From 8c6984d44209daa31b53c8e630cab8058645287f Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Mon, 7 Feb 2022 12:04:24 +0100 Subject: [PATCH 32/56] starting to add some docs in the agent module --- .gitignore | 3 + Makefile | 2 + docs/external_contributions.rst | 2 +- l2rpn_baselines/Kaist | 2 +- l2rpn_baselines/utils/gymAgent.py | 38 ++++-- l2rpn_baselines/utils/gymenv_custom.py | 168 ++++++++++++++++++++++++- 6 files changed, 200 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 5e218f6..ae1de2e 100644 --- a/.gitignore +++ b/.gitignore @@ -186,3 +186,6 @@ examples/ppo_stable_baselines/logs/** logs/ saved_agents/ test_train_DuelQSimple.py +line_act.json +tensorboard/ +test_sac/ \ No newline at end of file diff --git a/Makefile b/Makefile index 5582b98..e57a9d8 100644 --- a/Makefile +++ b/Makefile @@ -18,3 +18,5 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +#cmd windows: > sphinx-build -M html docs _doc_built diff --git a/docs/external_contributions.rst b/docs/external_contributions.rst index b650001..771cb02 100644 --- a/docs/external_contributions.rst +++ b/docs/external_contributions.rst @@ -217,7 +217,7 @@ github: https://github.com/polixir/L2RPN_2021 .. note:: 1st at the fourth L2RPN competion in 2021 for ICAPS conference (team "*Xd_Silly*") -.. _yzmn: +.. _yzm: YZM_Test (2 submissions) ------------------------- diff --git a/l2rpn_baselines/Kaist b/l2rpn_baselines/Kaist index b2b6561..71c49e7 160000 --- a/l2rpn_baselines/Kaist +++ b/l2rpn_baselines/Kaist @@ -1 +1 @@ -Subproject commit b2b6561a2cc3afbf03fd13ef6d1b334e4ec6c98a +Subproject commit 71c49e73ace272fd6d8258a5295abc2b8d3bea1b diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 8037367..2239626 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -28,18 +28,42 @@ class GymAgent(BaseAgent): Use it only with a trained agent. It does not provide the "save" method and is not suitable for training. - TODO heuristic part ! - - ..info:: + .. note:: To load a previously saved agent the function `GymAgent.load` will be called and you must provide the `nn_path` keyword argument. To build a new agent, the function `GymAgent.build` is called and you must provide the `nn_kwargs` keyword argument. - - You cannot set both, you have to set one. - - TODO example !!! 
+ + Examples + --------- + Some examples of such agents are provided in the classes: + + - :class:`l2rpn_baselines.PPO_SB3.PPO_SB3` that implements such an agent with the "stable baselines3" RL framework + - :class:`l2rpn_baselines.PPO_RLLIB.PPO_RLLIB` that implements such an agent with the "ray / rllib" RL framework + + Both can benefit from the feature of this class, most notably the possibility to include "heuristics" (such as: + "if a powerline can be reconnected, do it" or "do not act if the grid is not in danger") + + Notes + ----- + The main goal of this class is to be able to use "heuristics" (both for training and at inference time) quite simply + and with out of the box support of external libraries. + + All top performers in all l2rpn competitions (as of writing) used some kind of heuristics in their agent (such as: + "if a powerline can be reconnected, do it" or "do not act if the grid is not in danger"). This is why we made some + effort to develop a generic class that allows to train agents directly using these "heuristics". + + This features is split in two parts: + + - At training time, the "*heuristics*" are part of the environment. The agent will see only observations that are relevant + to it (and not the stat handled by the heuristic.) + - At inference time, the "*heuristics*" of the environment used to train the agent are included in the "agent.act" function. + If a heuristic has been used at training time, the agent will first "ask" the environment is a heuristic should be + performed on the grid (in this case it will do it) otherwise it will ask the underlying neural network what to do. + + Some examples are provided in the "examples" code (under the "examples/ppo_stable_baselines") repository that + demonstrates the use of :class:`l2rpn_baselines.utils.GymEnvWithRecoWithDN` . """ def __init__(self, diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index de21a25..e73d624 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -22,6 +22,36 @@ class GymEnvWithHeuristics(GymEnv): It can be used, for example, to train an agent (for example a deep-rl agent) if you want to use some heuristics at inference time (for example you reconnect every powerline that you can.) + + The heuristic you want to implement should be implemented in :func:`GymEnvWithHeuristics.heuristic_actions`. + + Examples + -------- + Let's imagine, for example, that you want to implement an RL agent that performs actions on the grid. But you noticed + that your agent performs better if the all the powerlines are reconnected (which is often the case by the way). + + To that end, you want to force the reconnection of powerline each time it's possible. When it's not possible, you want to + let the neural network do what is best for the environment. + + Training an agent on such setting might be difficult and require recoding some (deep) part of the training framework (*eg* + stable-baselines). Unless... You use a dedicated "environment". + + In this environment (compatible, inheriting the base class `gym.Env`) will handle all the "heuristic" part and only show + the agent with the state where it should act. + + Basically a "step" happens like this: + + #. the agent issue an action (gym format) + #. the action (gym format) is decoded to a grid2op compatible action (thanks to the action_space) + #. 
this grid2op action is implemented on the grid (thanks to the underlying grid2op environment) + and the corresponding grid2op observation is generated + #. this observation is processed by the :func:`GymEnvWithHeuristics.apply_heuristics_actions`: the grid2op_env.step + is called until the NN agent is require to take a decision (or the flag `done=True` is set) + #. the observation (corresponding to the last step above) is then converted to a gym action (thanks to the observation_space) + which is forwarded to the agent. + + The agent then only "sees" what is not processed by the heuristic. It is trained only on the relevant "state". + """ @abstractmethod def heuristic_actions(self, @@ -29,6 +59,45 @@ def heuristic_actions(self, reward: float, done: bool, info: Dict) -> List[BaseAction]: + """This function has the same signature as the "agent.act" function. It allows to implement a heuristic. + + It can be called multiple times per "gymenv step" and is expect to return a list of grid2op actions (in the + correct order) to be done on the underlying grid2op environment. + + An implementation of such a function (for example) can be found at :func:`GymEnvWithReco.heuristic_actions` or + :func:`GymEnvWithRecoWithDN.heuristic_actions` + + This function can return a list of action that will "in turn" be executed on the grid. It is only after each + and every actions that are returned that this function is called again. + + .. note:: + You MUST return "[do_nothing]" if your heuristic chose to do nothing at a certain step. Otherwise (if + the returned list is empty "[]" the agent is asked to perform an action.) + + .. note:: + We remind that inside a "gym env" step, a lot of "grid2op env" steps might be happening. + + As long as a heuristic action is selected (ie as long as this function does not return the empty list) + this action is performed on the grid2op environment. + + Parameters + ---------- + g2op_obs : BaseObservation + [description] + reward : float + The last reward the agent (or the heuristic) had. + This is the `reward` part of the last call to `obs, reward, done, info = grid2op_env.step(grid2op_act)` + done : bool + Whether the environment is "done" or not. It should be "False" in most cases. + This is the `done` part of the last call to `obs, reward, done, info = grid2op_env.step(grid2op_act)` + info : Dict + `info` part of the last call to `obs, reward, done, info = grid2op_env.step(grid2op_act)` + + Returns + ------- + List[BaseAction] + The ordered list of actions to implement, selected by the "heuristic" / "expert knowledge" / "automatic action". + """ return [] def apply_heuristics_actions(self, @@ -36,10 +105,52 @@ def apply_heuristics_actions(self, reward: float, done: bool, info: Dict ) -> Tuple[BaseObservation, float, bool, Dict]: + """This function implements the "logic" behind the heuristic part. Unless you have a particular reason too, you + probably should not modify this function. + + If you modify it, you should also modify the way the agent implements it (remember: this function is used + at training time, the "GymAgent" part is used at inference time. Both behaviour should match for the best + performance). + + While there are "heuristics" / "expert rules" / etc. this function should perform steps in the underlying grid2op + environment. + + It is expected to return when: + + - either the flag `done` is ``True`` + - or the neural network agent is asked to perform action on the grid + + The neural network agent will receive the outpout of this function. 
+ + Parameters + ---------- + g2op_obs : BaseObservation + The grid2op observation. + + reward : ``float`` + The reward + + done : ``bool`` + The flag that indicates whether the environment is over or not. + + info : Dict + Other information flags + + Returns + ------- + Tuple[BaseObservation, float, bool, Dict] + It should return `obs, reward, done, info`(same as a single call to `grid2op_env.step(grid2op_act)`) + + Then, this will be transmitted to the neural network agent (but before the observation will be + transformed to a gym observation thanks to the observation space.) + + """ need_action = True + tmp_reward = reward + tmp_info = info while need_action: need_action = False - g2op_actions = self.heuristic_actions(g2op_obs, reward, done, info) + g2op_actions = self.heuristic_actions(g2op_obs, tmp_reward, done, tmp_info) for g2op_act in g2op_actions: need_action = True tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act) @@ -48,21 +159,43 @@ def apply_heuristics_actions(self, if tmp_done: break if done: + reward = tmp_reward break return g2op_obs, reward, done, info def step(self, gym_action): - """[summary] + """This function implements the special case of the "step" function (as seen by the "gym environment") that might + call multiple times the "step" function of the underlying "grid2op environment" depending on the + heuristic. + + It takes a gym action, convert it to a grid2op action (thanks to the action space). + + Then process the heuristics / expert rules / forced actions / etc. and return the next gym observation that will + be processed by the agent. + + The number of "grid2op steps" can vary between different "gym environment" call to "step". + + It has the same signature as the `gym.Env` "step" function, of course. Parameters ---------- - gym_action : [type] - [description] + gym_action : + the action (represented as a gym one) that the agent wants to perform. Returns ------- - [type] - [description] + gym_obs: + The gym observation that will be processed by the agent + + reward: ``float`` + The reward of the agent (that might be computed by the ) + + done: ``bool`` + Whether the episode is over or not + + info: Dict + Other type of informations + """ g2op_act = self.action_space.from_gym(gym_action) g2op_obs, reward, done, info = self.init_env.step(g2op_act) @@ -71,6 +204,29 @@ def step(self, gym_action): gym_obs = self.observation_space.to_gym(g2op_obs) return gym_obs, float(reward), done, info + def reset(self): + """This function implements the "reset" function. It is called at the end of every episode and + marks the beginning of a new one. + + Again, before the agents sees any observations from the environment, they are processed by the + "heuristics" / "expert rules". + + .. note:: + The first observation seen by the agent is not necessarily the first observation of the grid2op environment. 
+ + Returns + ------- + gym_obs: + The first open ai gym observation received by the agent + """ + super().reset() + g2op_obs = self.init_env.get_obs() + reward = self.init_env.reward_range[0] + done = False + info = {} + g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) + gym_obs = self.observation_space.to_gym(g2op_obs) + return gym_obs class GymEnvWithReco(GymEnvWithHeuristics): """[summary] From 025accc394443300a6d574674bcdc8e9006adc2b Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Mon, 7 Feb 2022 14:13:25 +0100 Subject: [PATCH 33/56] adding some docs for the gym env --- examples/ppo_stable_baselines/A_prep_env.py | 11 +++- l2rpn_baselines/utils/gymenv_custom.py | 71 +++++++++++++++++---- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index 4b85220..c2f7b7b 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -9,6 +9,7 @@ # this file needs to be run only once, it might take a while ! import os import json +import sys import numpy as np import grid2op from grid2op.dtypes import dt_int @@ -17,11 +18,14 @@ from lightsim2grid import LightSimBackend import numpy as np +is_windows = sys.platform.startswith("win32") env_name = "l2rpn_icaps_2021_small" name_stats = "_reco_powerline" -nb_process_stats = 8 +nb_process_stats = 8 if not is_windows else 1 verbose = 1 +deep_copy = is_windows # force the deep copy on windows (due to permission issue in symlink in windows) + def _aux_get_env(env_name, dn=True, name_stat=None): path_ = grid2op.get_current_local_dir() @@ -92,8 +96,9 @@ def get_env_seed(env_name: str): env.seed(1) env.reset() nm_train, nm_val, nm_test = env.train_val_split_random(add_for_test="test", - pct_val=4.2, - pct_test=4.2) + pct_val=4.2, + pct_test=4.2, + deep_copy=deep_copy) # computes some statistics for val / test to compare performance of # some agents with the do nothing for example diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index e73d624..f7f19f4 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -229,17 +229,34 @@ def reset(self): return gym_obs class GymEnvWithReco(GymEnvWithHeuristics): - """[summary] + """This specific type of environment with "heuristics" / "expert rules" / "expert actions" is an + example to illustrate how to perfom an automatic powerline reconnection. + + For this type of environment the only heuristic implemented is the following: "each time i can + reconnect a powerline, i don't ask the agent, i reconnect it and send it the state after the powerline + has been reconnected". - Parameters - ---------- - GymEnv : [type] - [description] + With the proposed class, implementing it is fairly easy as shown in function :func:`GymEnvWithReco.heuristic_actions` + """ def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: + """The heuristic is pretty simple: each there is a powerline with a cooldown at 0 and that is disconnected + the heuristic reconnects it. 
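+
+        For instance, if powerlines 2 and 7 are disconnected and can be reconnected,
+        this method returns, in essence (the line ids depend on the observation):
+
+        .. code-block:: python
+
+            [self.init_env.action_space({"set_line_status": [(2, +1)]}),
+             self.init_env.action_space({"set_line_status": [(7, +1)]})]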
+ + Parameters + ---------- + See parameters of :func:`GymEnvWithHeuristics.heuristic_actions` + + Returns + ------- + See return values of :func:`GymEnvWithHeuristics.heuristic_actions` + """ + + # computes which powerline can be reconnected to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status) res = [] if np.any(to_reco): + # If I can reconnect any, I do it reco_id = np.where(to_reco)[0] for line_id in reco_id: g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]}) @@ -248,14 +265,42 @@ def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: class GymEnvWithRecoWithDN(GymEnvWithHeuristics): - """[summary] - - Parameters - ---------- - GymEnv : [type] - [description] + """This environment is slightly more complex that the other one. + + It consists in 2 things: + + #. reconnecting the powerlines if possible + #. doing nothing is the state of the grid is "safe" (for this class, the notion of "safety" is pretty simple: if all + flows are bellow 90% (by default) of the thermal limit, then it is safe) + + If for a given step, non of these things is applicable, the underlying trained agent is asked to perform an action + + .. warning:: + When using this environment, we highly recommend to adapt the parameter `safe_max_rho` to suit your need. + + Sometimes, 90% of the thermal limit is too high, sometimes it is too low. + """ + def __init__(self, env_init, safe_max_rho=0.9): + super().__init__(env_init) + self._safe_max_rho = safe_max_rho + def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: + """To match the description of the environment, this heuristic will: + + - return the list of all the powerlines that can be reconnected if any + - return the list "[do nothing]" is the grid is safe + - return the empty list (signaling the agent should take control over the heuristics) otherwise + + Parameters + ---------- + See parameters of :func:`GymEnvWithHeuristics.heuristic_actions` + + Returns + ------- + See return values of :func:`GymEnvWithHeuristics.heuristic_actions` + """ + to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status) res = [] if np.any(to_reco): @@ -264,8 +309,8 @@ def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: for line_id in reco_id: g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]}) res.append(g2op_act) - elif g2op_obs.rho.max() <= 0.9: - # play do nothing if there is no problem + elif g2op_obs.rho.max() <= self._safe_max_rho: + # play do nothing if there is "no problem" according to the "rule of thumb" res = [self.init_env.action_space()] return res From 1253ccc2c41902af83a32e33010785f736189cdb Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Mon, 7 Feb 2022 14:39:11 +0100 Subject: [PATCH 34/56] need to do the docs of the 'utils' module and then we're good to go --- .../ppo_stable_baselines/B_train_agent.py | 5 +++-- .../C_evaluate_trained_model.py | 4 ++-- l2rpn_baselines/utils/gymenv_custom.py | 19 +++++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 5e96606..45b6c85 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -20,6 +20,7 @@ env_name = "l2rpn_icaps_2021_small_train" save_path = "./saved_model" +name = "expe_test2" gymenv_class = GymEnvWithRecoWithDN # customize the reward function (optional) @@ 
-109,10 +110,9 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): "curtailment", "gen_p_before_curtail"] act_attr_to_keep = ["redispatch", "curtail"] - nb_iter = 6_000 + nb_iter = 100 learning_rate = 3e-3 net_arch = [300, 300, 300] - name = "expe_test" gamma = 0.999 env = grid2op.make(env_name, @@ -126,6 +126,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! + print("environment loaded !") trained_agent = train( env, diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index 9eae388..094aa9d 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -17,10 +17,10 @@ from l2rpn_baselines.PPO_SB3 import evaluate from A_prep_env import _aux_get_env, get_env_seed, name_stats -from B_train_agent import gymenv_class +from B_train_agent import gymenv_class, name env_name = "l2rpn_icaps_2021_small_val" -agent_name = "expe_test" +agent_name = name nb_scenario = 25 nb_process_stats = 1 load_path = "./saved_model" diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index f7f19f4..98008ac 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -219,13 +219,17 @@ def reset(self): gym_obs: The first open ai gym observation received by the agent """ - super().reset() - g2op_obs = self.init_env.get_obs() - reward = self.init_env.reward_range[0] - done = False - info = {} - g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) - gym_obs = self.observation_space.to_gym(g2op_obs) + done = True + while done: + super().reset() # reset the scenario + g2op_obs = self.init_env.get_obs() # retrieve the observation + reward = self.init_env.reward_range[0] # the reward at first step is always minimal + info = {} # no extra information provided ! 
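+            # the "while done" loop above draws a new scenario whenever the heuristic
+            # phase below ends the episode on its own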
+ + # perform the "heuristics" steps + g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) + # convert back the observation to gym + gym_obs = self.observation_space.to_gym(g2op_obs) return gym_obs class GymEnvWithReco(GymEnvWithHeuristics): @@ -312,5 +316,4 @@ def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]: elif g2op_obs.rho.max() <= self._safe_max_rho: # play do nothing if there is "no problem" according to the "rule of thumb" res = [self.init_env.action_space()] - return res From 95105da33b6ce2e60756637be457436fc6ff4738 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Wed, 30 Mar 2022 17:28:31 +0200 Subject: [PATCH 35/56] some improvments using new environment --- examples/ppo_stable_baselines/A_prep_env.py | 35 +++++++++++-------- .../ppo_stable_baselines/B_train_agent.py | 31 ++++++++++------ .../C_evaluate_trained_model.py | 32 ++++++++++------- examples/ppo_stable_baselines/ReadMe.md | 10 +++++- 4 files changed, 70 insertions(+), 38 deletions(-) diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index c2f7b7b..6631b39 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -14,15 +14,18 @@ import grid2op from grid2op.dtypes import dt_int from grid2op.Agent import RecoPowerlineAgent -from grid2op.utils import ScoreICAPS2021, EpisodeStatistics +from grid2op.utils import ScoreL2RPN2020, ScoreICAPS2021, EpisodeStatistics from lightsim2grid import LightSimBackend import numpy as np is_windows = sys.platform.startswith("win32") env_name = "l2rpn_icaps_2021_small" +env_name = "l2rpn_wcci_2022_dev" +SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 + name_stats = "_reco_powerline" -nb_process_stats = 8 if not is_windows else 1 +nb_process_stats = 4 if not is_windows else 1 verbose = 1 deep_copy = is_windows # force the deep copy on windows (due to permission issue in symlink in windows) @@ -33,9 +36,13 @@ def _aux_get_env(env_name, dn=True, name_stat=None): if not os.path.exists(path_env): raise RuntimeError(f"The environment \"{env_name}\" does not exist.") - path_dn = os.path.join(path_env, "_statistics_icaps2021_dn") + if SCOREUSED == ScoreICAPS2021: + path_dn = os.path.join(path_env, "_statistics_icaps2021_dn") + else: + path_dn = os.path.join(path_env, "_statistics_l2rpn_dn") + if not os.path.exists(path_dn): - raise RuntimeError("The folder _statistics_icaps2021_dn used for computing the score do not exist") + raise RuntimeError("The folder _statistics_icaps2021_dn (or _statistics_l2rpn_dn) used for computing the score do not exist") path_reco = os.path.join(path_env, "_statistics_l2rpn_no_overflow_reco") if not os.path.exists(path_reco): raise RuntimeError("The folder _statistics_l2rpn_no_overflow_reco used for computing the score do not exist") @@ -107,16 +114,16 @@ def get_env_seed(env_name: str): env_tmp = grid2op.make(nm_, backend=LightSimBackend()) nb_scenario = len(env_tmp.chronics_handler.subpaths) print(f"{nm_}: {nb_scenario}") - my_score = ScoreICAPS2021(env_tmp, - nb_scenario=nb_scenario, - env_seeds=np.random.randint(low=0, - high=max_int, - size=nb_scenario, - dtype=dt_int), - agent_seeds=[0 for _ in range(nb_scenario)], - verbose=verbose, - nb_process_stats=nb_process_stats, - ) + my_score = SCOREUSED(env_tmp, + nb_scenario=nb_scenario, + env_seeds=np.random.randint(low=0, + high=max_int, + size=nb_scenario, + dtype=dt_int), + agent_seeds=[0 for _ in range(nb_scenario)], + verbose=verbose, + 
nb_process_stats=nb_process_stats, + ) # compute statistics for reco powerline seeds = get_env_seed(nm_) diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 6fba6d9..06bdd6b 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -19,11 +19,12 @@ from l2rpn_baselines.utils import GymEnvWithReco, GymEnvWithRecoWithDN env_name = "l2rpn_icaps_2021_small_train" +env_name = "l2rpn_wcci_2022_dev_train" save_path = "./saved_model" -name = "expe_GymEnvWithRecoWithDN_sqrt" -gymenv_class = GymEnvWithRecoWithDN +name = "expe_GymEnvWithRecoWithDN_2022_test4" +gymenv_class = GymEnvWithRecoWithDN # uses the heuristic to do nothing is the grid is not at risk and to reconnect powerline automatically max_iter = 7 * 24 * 12 # None to deactivate it -safe_max_rho = 0.75 +safe_max_rho = 0.9 # the grid is said "safe" if the rho is lower than this value, it is a really important parameter to tune ! # customize the reward function (optional) @@ -58,6 +59,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): if is_done: res = np.sqrt(env.nb_time_step / env.max_episode_duration()) print(f"{os.path.split(env.chronics_handler.get_id())[-1]}: {env.nb_time_step = }, reward : {res:.3f}") + if env.nb_time_step <= 5: + print(f"reason game over: {env.infos['exception']}") # episode is over => 2 cases # if env.nb_time_step == env.max_episode_duration(): # return self.reward_max @@ -108,14 +111,22 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): from lightsim2grid import LightSimBackend # highly recommended ! from grid2op.Chronics import MultifolderWithCache # highly recommended for training - obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", - "gen_p", "load_p", "p_or", + # you can change below (full list at https://grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes) + obs_attr_to_keep = ["month", "day_of_week", "hour_of_day", "minute_of_hour", + "gen_p", "load_p", + "p_or", "rho", "timestep_overflow", "line_status", + # dispatch part of the observation "actual_dispatch", "target_dispatch", - "rho", "timestep_overflow", "line_status", - "curtailment", "gen_p_before_curtail"] - - act_attr_to_keep = ["redispatch", "curtail"] - nb_iter = 1_000_000 + # storage part of the observation + "storage_charge", "storage_power", + # curtailment part of the observation + "curtailment", "curtailment_limit", "gen_p_before_curtail", + ] + # same here you can change it as you please + act_attr_to_keep = ["redispatch", "curtail", "set_storage"] + + # parameters for the learning + nb_iter = 300_000 learning_rate = 3e-4 net_arch = [200, 200, 200, 200] gamma = 0.999 diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index d7998d6..390eeb6 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -10,31 +10,37 @@ import numpy as np import grid2op -from grid2op.utils import ScoreICAPS2021 +from grid2op.utils import ScoreICAPS2021, ScoreL2RPN2020 from lightsim2grid import LightSimBackend from grid2op.gym_compat import GymEnv from l2rpn_baselines.PPO_SB3 import evaluate from A_prep_env import _aux_get_env, get_env_seed, name_stats -from B_train_agent import gymenv_class, name +from B_train_agent import gymenv_class, name, safe_max_rho env_name = 
"l2rpn_icaps_2021_small_val" +env_name = "l2rpn_wcci_2022_dev_val" +SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 + agent_name = name -nb_scenario = 25 +nb_scenario = 10 nb_process_stats = 1 load_path = "./saved_model" -iter_num = 100_000 # put None for the latest version +iter_num = None # put None for the latest version verbose = True -def load_agent(env, load_path, name, gymenv_class=GymEnv): +def load_agent(env, load_path, name, + gymenv_class=gymenv_class, + gymenv_kwargs={"safe_max_rho": safe_max_rho}): trained_agent, _ = evaluate(env, nb_episode=0, load_path=load_path, name=name, gymenv_class=gymenv_class, - iter_num=iter_num) + iter_num=iter_num, + gymenv_kwargs=gymenv_kwargs) return trained_agent @@ -68,13 +74,13 @@ def get_ts_survived_reco(env_name): dn_ts_survived = get_ts_survived_dn(env_name) reco_ts_survived = get_ts_survived_reco(env_name) - my_score = ScoreICAPS2021(env_val, - nb_scenario=nb_scenario, - env_seeds=get_env_seed(env_name)[:nb_scenario], - agent_seeds=[0 for _ in range(nb_scenario)], - verbose=verbose, - nb_process_stats=nb_process_stats, - ) + my_score = SCOREUSED(env_val, + nb_scenario=nb_scenario, + env_seeds=get_env_seed(env_name)[:nb_scenario], + agent_seeds=[0 for _ in range(nb_scenario)], + verbose=verbose, + nb_process_stats=nb_process_stats, + ) my_agent = load_agent(env_val, load_path=load_path, name=agent_name, gymenv_class=gymenv_class) _, ts_survived, _ = my_score.get(my_agent) diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md index 2745987..4185b0b 100644 --- a/examples/ppo_stable_baselines/ReadMe.md +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -29,8 +29,16 @@ In this phase, we do 3 things: ## 2 Training the agent -In this phase TODO +In this phase where the training takes place and is implemented in the script `B_train_agent.py` +This script will show you how to modify the reward function (if needed), how to select some part of the observation and the action space to train a `PPO` using the "stable baselines 3" framework. This agent only uses **continuous** action types (`redispatching`, `curtailment` and action on `storage units`) and does not modify the topology at all. + +This script leverage the most common pattern used by best performing submissions at previous l2rpn competitions and allows you to train agents using some "heuristics" (*eg* "do not act when the grid is safe" or "reconnect a powerline as soon as you can"). This is made possible by the implementation of such "heursitics" directly in the environment: the neural network (agent) only gets observations when it should do something. Said differently, when a heuristic can operate the grid, the NN is "skipped" and does not even sees the observation. At inference time, the same mechanism is used. This makes the training and the evaluation consistent with one another. + +This also means that the number of steps performed by grid2op is higher than the number of observations seen by the agent. The training can take a long time. + + +What is of particular importance in this script, beside the usual "learning rate" and "neural network architecture" is the "`safe_max_rho`" meta parameters. This parameters controls when the agent is asked to perform an action (when any `obs.rho >= safe_max_rho`). If it's too high, then the agent will almost never act and might not learn anything. If it's too low then the "heuristic" part ("do nothing when the grid is safe") will not be used and the agent might take a lot of time to learn this. 
## 3 evaluate the agent From cb3a2e0cfc4050743f65698de0392800cf692ccd Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Fri, 22 Apr 2022 15:03:19 +0200 Subject: [PATCH 36/56] introducing basic compatibility with 'new' openai reset api --- l2rpn_baselines/utils/gymenv_custom.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index 9088305..bd4f5f6 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -223,7 +223,7 @@ def step(self, gym_action): gym_obs = self.observation_space.to_gym(g2op_obs) return gym_obs, float(reward), done, info - def reset(self): + def reset(self, seed=None, return_info=False, options=None): """This function implements the "reset" function. It is called at the end of every episode and marks the beginning of a new one. @@ -239,17 +239,22 @@ def reset(self): The first open ai gym observation received by the agent """ done = True + info = {} # no extra information provided ! while done: - super().reset() # reset the scenario + super().reset(seed, return_info, options) # reset the scenario g2op_obs = self.init_env.get_obs() # retrieve the observation reward = self.init_env.reward_range[0] # the reward at first step is always minimal - info = {} # no extra information provided ! # perform the "heuristics" steps g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, False, info) + # convert back the observation to gym gym_obs = self.observation_space.to_gym(g2op_obs) - return gym_obs + + if return_info: + return gym_obs, info + else: + return gym_obs class GymEnvWithReco(GymEnvWithHeuristics): """This specific type of environment with "heuristics" / "expert rules" / "expert actions" is an From 44dd19876ea138f1b175b567a51ad93ee029d05d Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Tue, 3 May 2022 18:16:48 +0200 Subject: [PATCH 37/56] adding an optimizer baseline, mark some baseline as deprecated in the doc --- .gitignore | 3 +- .readthedocs.yml | 2 +- docs/deepqsimple.rst | 8 + docs/doubleduelingdqn.rst | 9 +- docs/doubleduelingrdqn.rst | 9 +- docs/duelqleapnet.rst | 8 + docs/duelqsimple.rst | 8 + docs/index.rst | 1 + docs/leapnetencoded.rst | 8 + docs/optimcvxpy.rst | 69 ++ docs/sacold.rst | 8 + l2rpn_baselines/DeepQSimple/__init__.py | 8 + l2rpn_baselines/DeepQSimple/deepQSimple.py | 9 + l2rpn_baselines/DeepQSimple/deepQ_NN.py | 10 +- l2rpn_baselines/DeepQSimple/deepQ_NNParam.py | 8 + l2rpn_baselines/DeepQSimple/evaluate.py | 7 + l2rpn_baselines/DeepQSimple/train.py | 8 + .../doubleDuelingDQNConfig.py | 9 + .../DoubleDuelingDQN/doubleDuelingDQN_NN.py | 12 +- .../DoubleDuelingRDQN/doubleDuelingRDQN.py | 10 + .../doubleDuelingRDQNConfig.py | 9 + .../DoubleDuelingRDQN/doubleDuelingRDQN_NN.py | 10 + l2rpn_baselines/DoubleDuelingRDQN/evaluate.py | 9 + l2rpn_baselines/DoubleDuelingRDQN/train.py | 9 + l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py | 9 + .../DuelQLeapNet/duelQLeapNet_NN.py | 9 + l2rpn_baselines/DuelQLeapNet/evaluate.py | 9 + .../DuelQLeapNet/leapNet_NNParam.py | 8 + l2rpn_baselines/DuelQLeapNet/train.py | 8 + l2rpn_baselines/DuelQSimple/duelQSimple.py | 9 + l2rpn_baselines/DuelQSimple/duelQ_NN.py | 12 +- l2rpn_baselines/DuelQSimple/duelQ_NNParam.py | 9 + l2rpn_baselines/DuelQSimple/evaluate.py | 8 + l2rpn_baselines/DuelQSimple/train.py | 8 + l2rpn_baselines/Kaist | 2 +- l2rpn_baselines/LeapNetEncoded/evaluate.py | 8 + .../LeapNetEncoded/leapNetEncoded.py | 9 + 
.../LeapNetEncoded/leapNetEncoded_NN.py | 8 + .../LeapNetEncoded/leapNetEncoded_NNParam.py | 8 + l2rpn_baselines/LeapNetEncoded/study.py | 13 +- l2rpn_baselines/LeapNetEncoded/train.py | 7 + l2rpn_baselines/OptimCVXPY/__init__.py | 15 + l2rpn_baselines/OptimCVXPY/evaluate.py | 11 + l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 780 ++++++++++++++++++ l2rpn_baselines/README.md | 59 +- l2rpn_baselines/SACOld/evaluate.py | 8 + l2rpn_baselines/SACOld/sacOld.py | 8 + l2rpn_baselines/SACOld/sacOld_NN.py | 8 + l2rpn_baselines/SACOld/sacOld_NNParam.py | 8 + l2rpn_baselines/SACOld/train.py | 8 + l2rpn_baselines/SliceRDQN/evaluate.py | 10 +- l2rpn_baselines/SliceRDQN/sliceRDQN.py | 10 + l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py | 10 + l2rpn_baselines/SliceRDQN/train.py | 10 + l2rpn_baselines/__init__.py | 11 +- l2rpn_baselines/test/test_optimcvxpy.py | 199 +++++ l2rpn_baselines/utils/baseDeepQ.py | 12 + l2rpn_baselines/utils/deepQAgent.py | 12 + l2rpn_baselines/utils/nnParam.py | 12 + l2rpn_baselines/utils/rlAgent.py | 83 -- l2rpn_baselines/utils/train_generic.py | 12 + l2rpn_baselines/utils/trainingParam.py | 12 + setup.py | 5 +- 63 files changed, 1562 insertions(+), 136 deletions(-) create mode 100644 docs/optimcvxpy.rst create mode 100644 l2rpn_baselines/OptimCVXPY/__init__.py create mode 100644 l2rpn_baselines/OptimCVXPY/evaluate.py create mode 100644 l2rpn_baselines/OptimCVXPY/optimCVXPY.py create mode 100644 l2rpn_baselines/test/test_optimcvxpy.py delete mode 100644 l2rpn_baselines/utils/rlAgent.py diff --git a/.gitignore b/.gitignore index ae1de2e..c835312 100644 --- a/.gitignore +++ b/.gitignore @@ -188,4 +188,5 @@ saved_agents/ test_train_DuelQSimple.py line_act.json tensorboard/ -test_sac/ \ No newline at end of file +test_sac/ +documentation/ diff --git a/.readthedocs.yml b/.readthedocs.yml index 679eaaa..6f2d283 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,7 +1,7 @@ version: 2 python: - version: 3.6 + version: 3.8 install: - method: pip path: . diff --git a/docs/deepqsimple.rst b/docs/deepqsimple.rst index 1875222..096e0fd 100644 --- a/docs/deepqsimple.rst +++ b/docs/deepqsimple.rst @@ -10,6 +10,14 @@ baseline. Don't expect to obtain state of the art method with this simple method An example to train this model is available in the train function :ref:`Example-deepqsimple` +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Exported class -------------- You can use this class with: diff --git a/docs/doubleduelingdqn.rst b/docs/doubleduelingdqn.rst index 5ac6a50..4df7c0d 100644 --- a/docs/doubleduelingdqn.rst +++ b/docs/doubleduelingdqn.rst @@ -10,7 +10,14 @@ This baseline is of type Double Duelling Deep Q Network, as in Duelling Q Networ It's main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state of the art results. - +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. 
+ Agent class ------------------------ You can use this class with: diff --git a/docs/doubleduelingrdqn.rst b/docs/doubleduelingrdqn.rst index 143bd40..1b9c820 100644 --- a/docs/doubleduelingrdqn.rst +++ b/docs/doubleduelingrdqn.rst @@ -10,7 +10,14 @@ This baseline is of type Recurrent Double Duelling Deep Q Network, as in Duellin It's main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state of the art results. - +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Agent class ------------------------ You can use this class with: diff --git a/docs/duelqleapnet.rst b/docs/duelqleapnet.rst index cce7bcf..3a8f88f 100644 --- a/docs/duelqleapnet.rst +++ b/docs/duelqleapnet.rst @@ -17,6 +17,14 @@ In this baseline, we use this very same architecture to model the Q function. Th An example to train this model is available in the train function :ref:`Example-leapnet`. +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Exported class -------------- You can use this class with: diff --git a/docs/duelqsimple.rst b/docs/duelqsimple.rst index 9c6af51..175dcfa 100644 --- a/docs/duelqsimple.rst +++ b/docs/duelqsimple.rst @@ -12,6 +12,14 @@ baseline. Don't expect to obtain state of the art method with this simple method An example to train this model is available in the train function :ref:`Example-duelqsimple`. +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Exported class -------------- You can use this class with: diff --git a/docs/index.rst b/docs/index.rst index dab24fc..dd7c839 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -48,6 +48,7 @@ in some environment. :caption: Open source libraries expertagent + optimcvxpy external_contributions Custom implementation diff --git a/docs/leapnetencoded.rst b/docs/leapnetencoded.rst index 852c909..ea8a9a2 100644 --- a/docs/leapnetencoded.rst +++ b/docs/leapnetencoded.rst @@ -21,6 +21,14 @@ a leap net) that parametrized the Q function. An example to train this model is available in the train function :ref:`Example-leapnetenc`. +.. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Exported class -------------- You can use this class with: diff --git a/docs/optimcvxpy.rst b/docs/optimcvxpy.rst new file mode 100644 index 0000000..c39d1cf --- /dev/null +++ b/docs/optimcvxpy.rst @@ -0,0 +1,69 @@ +.. 
currentmodule:: l2rpn_baselines.OptimCVXPY
+
+OptimCVXPY: An example implementation of an agent based on an optimizer
+========================================================================
+
+.. note::
+    This "baseline" uses optimization and only performs actions on
+    continuous variables.
+
+    If you want a more general agent, you can use:
+
+    - :mod:`l2rpn_baselines.ExpertAgent.ExpertAgent` to perform actions on discrete variables
+      (especially the topology) using some heuristics
+    - `grid2op_milp_agent `_ that also uses
+      an optimization package (in this case google "or-tools") to perform topological
+      actions. The integration of this baseline in l2rpn-baselines is in progress.
+
+Description
+-----------
+This agent chooses its action by solving, at each `agent.act(...)` call, an optimization routine
+that is then converted to a grid2op action.
+
+It has 3 main behaviours:
+
+- `safe grid`: when the grid is safe, it tries to get back to an "original" state. It will
+  gradually cancel all past redispatching and curtailment actions and aim at a storage state
+  of charge close to `0.5 * Emax` for all storage units. If the grid is safe this agent can
+  also take some actions to reconnect powerlines.
+- `unsafe grid`: when the grid is unsafe, it tries to set it back to a "safe" state (all flows
+  below their thermal limit) by optimizing storage units, curtailment and redispatching only.
+  (This agent does not perform topological actions in this state.)
+- `intermediate grid`: in this state the agent does nothing. This state is mainly present
+  to prevent this agent from "oscillating" between the safe and unsafe states.
+
+Which behaviour the agent adopts depends on the maximum power flow (in percent)
+of the grid. If the maximum power flow is below a certain threshold (`rho_safe`), the agent is in the
+"safe grid" state. If the maximum power flow is above a certain threshold (`rho_danger`), the agent is in the
+"unsafe grid" state.
+
+This agent uses the DC approximation in its optimization routine. In the
+current formulation, it is "greedy" and does not "look ahead", though this would be possible.
+
+safe grid
++++++++++
+TODO: explain the optimization problem solved!
+
+unsafe grid
++++++++++++
+The goal in this case is to get back to a safe state as quickly as possible.
+
+To that end, the agent minimizes the thermal limit violations. To avoid undesired behaviour
+where the agent would apply too much redispatching / curtailment / storage action (for example
+by saturating their constraints), you can also penalize these quantities in
+the optimization problem with the parameters `penalty_curtailment`, `penalty_redispatching` and
+`penalty_storage`.
+
+Agent class
+-----------
+This agent does not require any training: it relies on an optimization routine, solved at each step, to help decide how to relieve an overload.
+You can use this class with:
+
+.. code-block:: python
+
+    from l2rpn_baselines.OptimCVXPY import OptimCVXPY
+    from l2rpn_baselines.OptimCVXPY import evaluate
+
+.. automodule:: l2rpn_baselines.OptimCVXPY.OptimCVXPY
+    :members:
+    :autosummary:
diff --git a/docs/sacold.rst b/docs/sacold.rst
index 8719892..ede48e1 100644
--- a/docs/sacold.rst
+++ b/docs/sacold.rst
@@ -16,6 +16,14 @@ backward compatibility with earlier version (< 0.5.0) of this package**
 
 An example to train this model is available in the train function :ref:`Example-sacold`.
 
+.. warning::
+    This baseline recodes entire the RL training procedure. 
You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Exported class -------------- You can use this class with: diff --git a/l2rpn_baselines/DeepQSimple/__init__.py b/l2rpn_baselines/DeepQSimple/__init__.py index 686ced0..0cf72aa 100644 --- a/l2rpn_baselines/DeepQSimple/__init__.py +++ b/l2rpn_baselines/DeepQSimple/__init__.py @@ -1,3 +1,11 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + __all__ = [ "DeepQSimple", "evaluate", diff --git a/l2rpn_baselines/DeepQSimple/deepQSimple.py b/l2rpn_baselines/DeepQSimple/deepQSimple.py index a278fc1..dc8b788 100644 --- a/l2rpn_baselines/DeepQSimple/deepQSimple.py +++ b/l2rpn_baselines/DeepQSimple/deepQSimple.py @@ -13,5 +13,14 @@ class DeepQSimple(DeepQAgent): """ A simple deep q learning algorithm. It does nothing different thant its base class. + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ pass diff --git a/l2rpn_baselines/DeepQSimple/deepQ_NN.py b/l2rpn_baselines/DeepQSimple/deepQ_NN.py index 2405e38..8698462 100644 --- a/l2rpn_baselines/DeepQSimple/deepQ_NN.py +++ b/l2rpn_baselines/DeepQSimple/deepQ_NN.py @@ -25,7 +25,15 @@ class DeepQ_NN(BaseDeepQ): """ Constructs the desired deep q learning network - + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Attributes ---------- schedule_lr_model: diff --git a/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py b/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py index f7d136e..249c0b7 100644 --- a/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py +++ b/l2rpn_baselines/DeepQSimple/deepQ_NNParam.py @@ -17,6 +17,14 @@ class DeepQ_NNParam(NNParam): Nothing really different compared to the base class except that :attr:`l2rpn_baselines.utils.NNParam.nn_class` (nn_class) is :class:`deepQ_NN.DeepQ_NN` + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. 
+ """ _int_attr = copy.deepcopy(NNParam._int_attr) _float_attr = copy.deepcopy(NNParam._float_attr) diff --git a/l2rpn_baselines/DeepQSimple/evaluate.py b/l2rpn_baselines/DeepQSimple/evaluate.py index 6f2506a..fa98bc4 100644 --- a/l2rpn_baselines/DeepQSimple/evaluate.py +++ b/l2rpn_baselines/DeepQSimple/evaluate.py @@ -38,6 +38,13 @@ def evaluate(env, """ How to evaluate the performances of the trained :class:`DeepQSimple` agent. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. Parameters ---------- diff --git a/l2rpn_baselines/DeepQSimple/train.py b/l2rpn_baselines/DeepQSimple/train.py index 7d1db21..99257b5 100755 --- a/l2rpn_baselines/DeepQSimple/train.py +++ b/l2rpn_baselines/DeepQSimple/train.py @@ -33,6 +33,14 @@ def train(env, """ This function implements the "training" part of the balines "DeepQSimple". + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py index 94d38af..b25d74d 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQNConfig.py @@ -13,6 +13,15 @@ class DoubleDuelingDQNConfig(): """ DoubleDuelingDQN configurable hyperparameters exposed as class attributes + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ LR_DECAY_STEPS = 1024*64 diff --git a/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py index ef3acd3..615fb4e 100644 --- a/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingDQN/doubleDuelingDQN_NN.py @@ -19,7 +19,17 @@ class DoubleDuelingDQN_NN(object): - """Constructs the desired deep q learning network""" + """Constructs the desired deep q learning network + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, action_size, observation_size, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py index 602c9e9..e1770c6 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN.py @@ -25,6 +25,16 @@ from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN_NN import DoubleDuelingRDQN_NN class DoubleDuelingRDQN(AgentWithConverter): + """ + .. 
warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, observation_space, action_space, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py index 45b384d..b4d5313 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQNConfig.py @@ -11,6 +11,15 @@ class DoubleDuelingRDQNConfig(): """ DoubleDuelingRDQN configurable hyperparameters as class attributes + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ INITIAL_EPSILON = 0.99 diff --git a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py index f0ca7d1..31b7064 100644 --- a/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/doubleDuelingRDQN_NN.py @@ -22,6 +22,16 @@ class DoubleDuelingRDQN_NN(object): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, action_size, observation_size, diff --git a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py index 66664f5..a759423 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/evaluate.py @@ -59,6 +59,15 @@ def evaluate(env, max_steps=DEFAULT_MAX_STEPS, verbose=DEFAULT_VERBOSE, save_gif=False): + ''' + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + ''' import tensorflow as tf # lazy import to save import time # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/DoubleDuelingRDQN/train.py b/l2rpn_baselines/DoubleDuelingRDQN/train.py index 8831b6b..2937a18 100755 --- a/l2rpn_baselines/DoubleDuelingRDQN/train.py +++ b/l2rpn_baselines/DoubleDuelingRDQN/train.py @@ -74,6 +74,15 @@ def train(env, batch_size=DEFAULT_BATCH_SIZE, learning_rate=DEFAULT_LR, verbose=DEFAULT_VERBOSE): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. 
+ """ import tensorflow as tf # lazy import to save import time # Set config diff --git a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py index b61351d..d029890 100644 --- a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py +++ b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet.py @@ -19,5 +19,14 @@ class DuelQLeapNet(DeepQAgent): Double Duelling Deep Q network baseline, with the particularity that the Q network is encoded with a leap net. It does nothing in particular. + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ pass diff --git a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py index a0f8ff8..26b76bd 100644 --- a/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py +++ b/l2rpn_baselines/DuelQLeapNet/duelQLeapNet_NN.py @@ -95,6 +95,15 @@ class DuelQLeapNet_NN(BaseDeepQ): """ Constructs the desired duelling deep q learning network with a leap neural network as a modeling of the q function + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ def __init__(self, nn_params, diff --git a/l2rpn_baselines/DuelQLeapNet/evaluate.py b/l2rpn_baselines/DuelQLeapNet/evaluate.py index 4e4a11d..20f1dfe 100644 --- a/l2rpn_baselines/DuelQLeapNet/evaluate.py +++ b/l2rpn_baselines/DuelQLeapNet/evaluate.py @@ -37,6 +37,15 @@ def evaluate(env, """ How to evaluate the performances of the trained :class:`DuelQLeapNet` agent. + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py b/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py index 736a983..baf4a2e 100644 --- a/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py +++ b/l2rpn_baselines/DuelQLeapNet/leapNet_NNParam.py @@ -18,6 +18,14 @@ class LeapNet_NNParam(NNParam): More information on the leap net can be found at `Leap Net on Github `_ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Attributes ----------- x_dim: ``int`` diff --git a/l2rpn_baselines/DuelQLeapNet/train.py b/l2rpn_baselines/DuelQLeapNet/train.py index e8d8355..e0f81d6 100755 --- a/l2rpn_baselines/DuelQLeapNet/train.py +++ b/l2rpn_baselines/DuelQLeapNet/train.py @@ -33,6 +33,14 @@ def train(env, """ This function implements the "training" part of the balines :class:`DuelQLeapNet`. + .. warning:: + This baseline recodes entire the RL training procedure. 
You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/DuelQSimple/duelQSimple.py b/l2rpn_baselines/DuelQSimple/duelQSimple.py index b1be20c..2f7610b 100644 --- a/l2rpn_baselines/DuelQSimple/duelQSimple.py +++ b/l2rpn_baselines/DuelQSimple/duelQSimple.py @@ -16,5 +16,14 @@ class DuelQSimple(DeepQAgent): Double Duelling Deep Q network baseline. It does nothing in particular. + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ pass diff --git a/l2rpn_baselines/DuelQSimple/duelQ_NN.py b/l2rpn_baselines/DuelQSimple/duelQ_NN.py index f264cd4..d4d7895 100644 --- a/l2rpn_baselines/DuelQSimple/duelQ_NN.py +++ b/l2rpn_baselines/DuelQSimple/duelQ_NN.py @@ -24,7 +24,17 @@ class DuelQ_NN(BaseDeepQ): - """Constructs the desired duelling deep q learning network""" + """Constructs the desired duelling deep q learning network + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, nn_params, training_param=None): diff --git a/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py b/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py index 8a40234..4e8ca48 100644 --- a/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py +++ b/l2rpn_baselines/DuelQSimple/duelQ_NNParam.py @@ -13,6 +13,15 @@ class DuelQ_NNParam(NNParam): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ _int_attr = copy.deepcopy(NNParam._int_attr) _float_attr = copy.deepcopy(NNParam._float_attr) _str_attr = copy.deepcopy(NNParam._str_attr) diff --git a/l2rpn_baselines/DuelQSimple/evaluate.py b/l2rpn_baselines/DuelQSimple/evaluate.py index 7825394..2b3cc4b 100644 --- a/l2rpn_baselines/DuelQSimple/evaluate.py +++ b/l2rpn_baselines/DuelQSimple/evaluate.py @@ -38,6 +38,14 @@ def evaluate(env, """ How to evaluate the performances of the trained DuelQSimple agent. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/DuelQSimple/train.py b/l2rpn_baselines/DuelQSimple/train.py index fa806b1..d0adf94 100755 --- a/l2rpn_baselines/DuelQSimple/train.py +++ b/l2rpn_baselines/DuelQSimple/train.py @@ -34,6 +34,14 @@ def train(env, """ This function implements the "training" part of the balines "DuelQSimple". + .. 
warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/Kaist b/l2rpn_baselines/Kaist index b2b6561..71c49e7 160000 --- a/l2rpn_baselines/Kaist +++ b/l2rpn_baselines/Kaist @@ -1 +1 @@ -Subproject commit b2b6561a2cc3afbf03fd13ef6d1b334e4ec6c98a +Subproject commit 71c49e73ace272fd6d8258a5295abc2b8d3bea1b diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py index 158019b..8d1690e 100644 --- a/l2rpn_baselines/LeapNetEncoded/evaluate.py +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -39,6 +39,14 @@ def evaluate(env, """ How to evaluate the performances of the trained DeepQSimple agent. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py index 81664bb..516768e 100644 --- a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded.py @@ -20,5 +20,14 @@ class LeapNetEncoded(DeepQAgent): that the Q network is encoded with a leap net. It does nothing in particular. + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ pass diff --git a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py index feee2d6..4170875 100644 --- a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NN.py @@ -32,6 +32,14 @@ class LeapNetEncoded_NN(BaseDeepQ): """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Constructs the desired neural networks. More information on the leap net can be found at `Leap Net on Github `_ diff --git a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py index 713e2dc..12619d5 100644 --- a/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py +++ b/l2rpn_baselines/LeapNetEncoded/leapNetEncoded_NNParam.py @@ -17,6 +17,14 @@ class LeapNetEncoded_NNParam(NNParam): """ This class implements the type of parameters used by the :class:`LeapNetEncoded` model. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). 
+ + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + More information on the leap net can be found at `Leap Net on Github `_ Attributes diff --git a/l2rpn_baselines/LeapNetEncoded/study.py b/l2rpn_baselines/LeapNetEncoded/study.py index 7059a8c..c274ec0 100644 --- a/l2rpn_baselines/LeapNetEncoded/study.py +++ b/l2rpn_baselines/LeapNetEncoded/study.py @@ -35,7 +35,18 @@ def study(env, max_steps=DEFAULT_MAX_STEPS, verbose=False, save_gif=False): - """study the prediction of the grid_model""" + """ + study the prediction of the grid_model + + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ import tensorflow as tf # Limit gpu usage diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py index 6e7dc91..1626b40 100755 --- a/l2rpn_baselines/LeapNetEncoded/train.py +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -34,6 +34,13 @@ def train(env, """ This function implements the "training" part of the baselines :class:`LeapNetEncoded`. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. Parameters ---------- diff --git a/l2rpn_baselines/OptimCVXPY/__init__.py b/l2rpn_baselines/OptimCVXPY/__init__.py new file mode 100644 index 0000000..34e8275 --- /dev/null +++ b/l2rpn_baselines/OptimCVXPY/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +__all__ = [ + "evaluate", + "OptimCVXPY" +] + +from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY +from l2rpn_baselines.OptimCVXPY.evaluate import evaluate diff --git a/l2rpn_baselines/OptimCVXPY/evaluate.py b/l2rpn_baselines/OptimCVXPY/evaluate.py new file mode 100644 index 0000000..bf20d6b --- /dev/null +++ b/l2rpn_baselines/OptimCVXPY/evaluate.py @@ -0,0 +1,11 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +def evaluate(): + # TODO ! 
+ pass diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py new file mode 100644 index 0000000..a8cb47f --- /dev/null +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -0,0 +1,780 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +from typing import Optional +import logging +import warnings +import cvxpy as cp +import numpy as np + +import grid2op +from grid2op.Agent import BaseAgent +from grid2op.Environment import Environment +from grid2op.Action import PlayableAction, ActionSpace, BaseAction +from grid2op.Backend import PandaPowerBackend +from grid2op.Observation import BaseObservation +from lightsim2grid import LightSimBackend +from lightsim2grid.gridmodel import init + +import pdb + +# TODO: "predictive control" +# TODO: no flow in constraints but in objective function +# TODO: reuse previous computations +class OptimCVXPY(BaseAgent): + """ + This agent choses its action by resolving, at each `agent.act(...)` call an optimization routine + that is then converted to a grid2op action. + + It has 3 main behaviours: + + - `safe grid`: when the grid is safe, it tries to get back to an "original" state. It will + gradually cancel all past redispatching and curtailment action and aim at a storage state + of charge close to `0.5 * Emax` for all storage units. If the grid is safe this agent can + also take some actions to reconnect powerlines. + - `unsafe grid`: when the grid is unsafe, it tries to set it back to a "safe" state (all flows + below their thermal limit) by optimizing storage units, curtailment and redispatching only. + (This agent does not perform topological actions in this state) + - `intermediate grid`: in this state the agent does nothing. This state is mainly present + to avoid this agent to "oscillate" between safe and unsafe states. + + The "behaviours" in which the agent is in depends on the maximum power flow (in percent) + of the grid. If the maximum power flow is below a certain threshold (`rho_safe`), the agent is in the + "safe grid" state. If the maximum power flow is above a certain threshold (`rho_danger`), the agent is in + "unsafe grid" state. + + Have a look at the documentation for more details about the optimization problems + solved in each case. + + + Parameters + ---------- + action_space : `grid2op.Action.ActionSpace` + The action space of the environment. + + _powerlines_x: `cp.Parameter` + The reactance of each powerline / transformer in the network given in per unit ! + + _margin_th_limit: `cp.Parameter` + In the "unsafe state" this agent will try to minimize the thermal limit violation. + + A "thermal limit violation" is defined as having a flow (in dc) above + `margin_th_limit * thermal_limit_mw`. + + The model is particularly sensitive to this parameter. + + rho_danger: `float` + If any `obs.rho` is above `rho_danger`, then the agent will use the + "unsafe grid" optimization routine and try to apply curtailment, + redispatching and action on storage unit to set back the grid into a safe state. 
+ + rho_safe: `float` + If all `obs.rho` are below `rho_safe`, then the agent will use the + "safe grid" optimization routine and try to set back the grid into + a reference state. + + nb_max_bus: `int` + Maximum number of buses allowed in the powergrid. + + _penalty_curtailment: ` cp.Parameter` + + _penalty_redispatching: `cp.Parameter` + + _penalty_storag: `cp.Parameter` + + bus_or: `cp.Parameter` + + bus_ex: `cp.Parameter` + + bus_load: `cp.Parameter` + + bus_gen: `cp.Parameter` + + bus_storage: `cp.Parameter` + + load_per_bus: `cp.Parameter` + + gen_per_bus: `cp.Parameter` + + redisp_up: `cp.Parameter` + + redisp_down: `cp.Parameter` + + curtail_down: `cp.Parameter` + + curtail_up: `cp.Parameter` + + storage_down: `cp.Parameter` + + storage_up: `cp.Parameter` + + th_lim_mw: `cp.Parameter` + + flow_computed: `np.ndarray` + + margin_rounding: `float` + + margin_sparse: `float` + + logger: `logging.Logger` + A logger to log information about the optimization process. + + """ + SOLVER_TYPES = [cp.SCS, cp.OSQP, cp.SCIPY] + + def __init__(self, + action_space : ActionSpace, + env : Environment, + lines_x_pu: Optional[np.array]=None, + margin_th_limit: float=0.9, + rho_danger: float=0.95, + rho_safe: float=0.85, + penalty_curtailment: float=0.1, + penalty_redispatching: float=0.03, + penalty_storage: float=0.3, + margin_rounding: float=0.01, + margin_sparse: float=1e-4, + logger : Optional[logging.Logger]=None) -> None: + """Initialize this class + + Parameters + ---------- + action_space : `grid2op.Action.ActionSpace` + The action space of the environment. + + env: `grid2op.Environment.Environment`: + The environment in which the agent evolves. + + If `lines_x_pu` is not provided, then this agent will attempt to read the + reactance of each powerlines and transformer from the environment backend. + + lines_x_pu: `np.ndarray` + The reactance of each powerline / transformer in the network. + + It is optional and if it's not provided, then the reactance will be read from the + environment. + + margin_th_limit: `float` + In the "unsafe state" this agent will try to minimize the thermal limit violation. + + A "thermal limit violation" is defined as having a flow (in dc) above + `margin_th_limit * thermal_limit_mw`. + + The model is particularly sensitive to this parameter. + + rho_danger: `float` + If any `obs.rho` is above `rho_danger`, then the agent will use the + "unsafe grid" optimization routine and try to apply curtailment, + redispatching and action on storage unit to set back the grid into a safe state. + + rho_safe: `float` + If all `obs.rho` are below `rho_safe`, then the agent will use the + "safe grid" optimization routine and try to set back the grid into + a reference state. + + penalty_curtailment: `float` + The cost of applying a curtailment in the objective function. + + Default value is 0.1. + + penalty_redispatching: `float` + The cost of applying a redispatching in the objective function. + + Default value is 0.03. + + penalty_storage: `float` + The cost of applying a storage in the objective function. + + Default value is 0.3. + + margin_rounding: `float` + A margin taken to avoid rounding issues that could lead to infeasible + actions due to "redispatching above max_ramp_up" for example. + + margin_sparse: `float` + A margin taken when converting the output of the optimization routine + to grid2op actions: if some values are below this value, then they are + set to zero. + + logger: `logging.Logger` + A logger to log information about the optimization process. 
+ + Raises + ------ + ValueError + If you provide a `lines_x_pu` that is not of the same size as the number of powerlines + + RuntimeError + In case the lines reactance are not provided and cannot + be inferred from the environment. + + """ + BaseAgent.__init__(self, action_space) + self._margin_th_limit: cp.Parameter = cp.Parameter(value=margin_th_limit, + nonneg=True) + self._penalty_curtailment: cp.Parameter = cp.Parameter(value=penalty_curtailment, + nonneg=True) + self._penalty_redispatching: cp.Parameter = cp.Parameter(value=penalty_redispatching, + nonneg=True) + self._penalty_storage: cp.Parameter = cp.Parameter(value=penalty_storage, + nonneg=True) + + self.margin_rounding: float = float(margin_rounding) + self.margin_sparse: float = float(margin_sparse) + self.rho_danger: float = float(rho_danger) + self.rho_safe: float = float(rho_safe) + + if lines_x_pu is not None: + powerlines_x = 1.0 * np.array(lines_x_pu).astype(float) + elif isinstance(env.backend, LightSimBackend): + # read the powerline x (reactance) from + # lightsim grid + powerlines_x = np.array( + [float(el.x_pu) for el in env.backend._grid.get_lines()] + + [float(el.x_pu) for el in env.backend._grid.get_trafos()]) + elif isinstance(env.backend, PandaPowerBackend): + # read the powerline x (reactance) from + # pandapower grid + pp_net = env.backend._grid + grid_model = init(pp_net) + powerlines_x = np.array( + [float(el.x_pu) for el in grid_model.get_lines()] + + [float(el.x_pu) for el in grid_model.get_trafos()]) + else: + # no powerline information available + raise RuntimeError(f"Unkown backend type: {type(env.backend)}. If you want to use " + "OptimCVXPY, you need to provide the reactance of each powerline / " + "transformer in per unit in the `lines_x` parameter.") + if powerlines_x.shape[0] != env.n_line: + raise ValueError("The number of lines in the grid is not the same as the number " + "of lines in provided lines_x") + if np.any(powerlines_x <= 0.): + raise ValueError("All powerline reactance must be strictly positive") + + self._powerlines_x: cp.Parameter = cp.Parameter(shape=powerlines_x.shape, + value=1.0 * powerlines_x, + pos=True) + + # TODO replace all below with sparse matrices + # to be able to change the topology more easily + self.nb_max_bus: int = 2 * env.n_sub + self.bus_or: cp.Parameter = cp.Parameter(shape=env.n_line, + value=env.line_or_to_subid, + integer=True) + self.bus_ex: cp.Parameter = cp.Parameter(shape=env.n_line, + value=env.line_ex_to_subid, + integer=True) + self.bus_load: cp.Parameter = cp.Parameter(shape=env.n_load, + value=env.load_to_subid, + integer=True) + self.bus_gen: cp.Parameter = cp.Parameter(shape=env.n_gen, + value=env.gen_to_subid, + integer=True) + self.bus_storage: cp.Parameter = cp.Parameter(shape=env.n_storage, + value=env.storage_to_subid, + integer=True) + + this_zeros_ = np.zeros(self.nb_max_bus) + self.load_per_bus: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + self.gen_per_bus: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + + self.redisp_up: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + self.redisp_down: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + + self.curtail_down: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + self.curtail_up: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + 
nonneg=True) + + self.storage_down: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + self.storage_up: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * this_zeros_, + nonneg=True) + + self._th_lim_mw: cp.Parameter = cp.Parameter(shape=env.n_line, + value=env.get_thermal_limit(), + nonneg=True) + + if logger is None: + self.logger: logging.Logger = logging.getLogger(__name__) + self.logger.disabled = False + else: + self.logger: logging.Logger = logger.getChild("OptimCVXPY") + + self.flow_computed = np.zeros(env.n_line, dtype=float) + self.flow_computed[:] = np.NaN + + @property + def margin_th_limit(self) -> cp.Parameter: + return self._margin_th_limit + + @margin_th_limit.setter + def margin_th_limit(self, val: float): + self._margin_th_limit = float(val) + + @property + def penalty_curtailment(self) -> cp.Parameter: + return self._penalty_curtailment + + @penalty_curtailment.setter + def penalty_curtailment(self, val: float): + self._penalty_curtailment = float(val) + + @property + def penalty_redispatching(self) -> cp.Parameter: + return self._penalty_redispatching + + @penalty_redispatching.setter + def penalty_redispatching(self, val: float): + self._penalty_redispatching = float(val) + + @property + def penalty_storage(self) -> cp.Parameter: + return self._penalty_storage + + @penalty_storage.setter + def penalty_storage(self, val: float): + self._penalty_storage = float(val) + + def _update_topo_param(self, obs: BaseObservation): + tmp_ = 1 * obs.line_or_to_subid + tmp_ [obs.line_or_bus == 2] += obs.n_sub + self.bus_or.value[:] = tmp_ + tmp_ = 1 * obs.line_ex_to_subid + tmp_ [obs.line_ex_bus == 2] += obs.n_sub + self.bus_ex.value[:] = tmp_ + + # "disconnect" in the model the line disconnected + # it should be equilavent to connect them all (at both side) to the slack + self.bus_ex.value [(obs.line_or_bus == -1) | (obs.line_ex_bus == -1)] = 0 + self.bus_or.value [(obs.line_or_bus == -1) | (obs.line_ex_bus == -1)] = 0 + + tmp_ = obs.load_to_subid + tmp_[obs.load_bus == 2] += obs.n_sub + self.bus_load.value[:] = tmp_ + + tmp_ = obs.gen_to_subid + tmp_[obs.gen_bus == 2] += obs.n_sub + self.bus_gen.value[:] = tmp_ + + tmp_ = obs.storage_to_subid + tmp_[obs.storage_bus == 2] += obs.n_sub + self.bus_storage.value[:] = tmp_ + + def _update_th_lim_param(self, obs: BaseObservation): + self._th_lim_mw.value[:] = (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 + self._th_lim_mw.value[:] = np.sqrt(self._th_lim_mw.value) + # TODO what if it's negative ! + + def _update_inj_param(self, obs: BaseObservation): + self.load_per_bus.value[:] = 0. + self.gen_per_bus.value[:] = 0. 
+ load_p = 1.0 * obs.load_p + load_p *= (obs.gen_p.sum() - obs.storage_power.sum()) / load_p.sum() + for bus_id in range(self.nb_max_bus): + self.load_per_bus.value[bus_id] += load_p[self.bus_load.value == bus_id].sum() + self.load_per_bus.value[bus_id] += obs.storage_power[self.bus_storage.value == bus_id].sum() + self.gen_per_bus.value[bus_id] += obs.gen_p[self.bus_gen.value == bus_id].sum() + + def _add_redisp_const(self, obs: BaseObservation, bus_id: int): + # add the constraint on the redispatching + self.redisp_up.value[bus_id] = obs.gen_margin_up[self.bus_gen.value == bus_id].sum() + self.redisp_down.value[bus_id] = obs.gen_margin_down[self.bus_gen.value == bus_id].sum() + + def _add_storage_const(self, obs: BaseObservation, bus_id: int): + # limit in MW + stor_down = obs.storage_max_p_prod[self.bus_storage.value == bus_id].sum() + # limit due to energy (if almost empty) + stor_down = np.minimum(stor_down, + obs.storage_charge[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) + ) + self.storage_down.value[bus_id] = stor_down + + # limit in MW + stor_up = obs.storage_max_p_absorb[self.bus_storage.value == bus_id].sum() + # limit due to energy (if almost full) + stor_up = np.minimum(stor_up, + (obs.storage_Emax - obs.storage_charge)[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) + ) + self.storage_up.value[bus_id] = stor_up + + def _update_constraints_param_unsafe(self, obs: BaseObservation): + tmp_ = 1.0 * obs.gen_p + tmp_[~obs.gen_renewable] = 0. + + for bus_id in range(self.nb_max_bus): + # redispatching + self._add_redisp_const(obs, bus_id) + + # curtailment + self.curtail_down.value[bus_id] = 0. + self.curtail_up.value[bus_id] = tmp_[(self.bus_gen.value == bus_id) & obs.gen_renewable].sum() + + # storage + self._add_storage_const(obs, bus_id) + + self._remove_margin_rounding() + + def _remove_margin_rounding(self): + self.storage_down.value[self.storage_down.value > self.margin_rounding] -= self.margin_rounding + self.storage_up.value[self.storage_up.value > self.margin_rounding] -= self.margin_rounding + self.curtail_down.value[self.curtail_down.value > self.margin_rounding] -= self.margin_rounding + self.curtail_up.value[self.curtail_up.value > self.margin_rounding] -= self.margin_rounding + self.redisp_up.value[self.redisp_up.value > self.margin_rounding] -= self.margin_rounding + self.redisp_down.value[self.redisp_down.value > self.margin_rounding] -= self.margin_rounding + + def _validate_param_values(self): + self.storage_down._validate_value(self.storage_down.value) + self.storage_up._validate_value(self.storage_up.value) + self.curtail_down._validate_value(self.curtail_down.value) + self.curtail_up._validate_value(self.curtail_up.value) + self.redisp_up._validate_value(self.redisp_up.value) + self.redisp_down._validate_value(self.redisp_down.value) + self._th_lim_mw._validate_value(self._th_lim_mw.value) + + def update_parameters(self, obs: BaseObservation, unsafe: bool = True): + ## update the topology information + self._update_topo_param(obs) + + ## update the thermal limit + self._update_th_lim_param(obs) + + ## update the load / gen bus injected values + self._update_inj_param(obs) + + ## update the constraints parameters + if unsafe: + self._update_constraints_param_unsafe(obs) + else: + self._update_constraints_param_safe(obs) + + # check that all parameters have correct values + # for example non negative values for non negative parameters + self._validate_param_values() + + def _aux_compute_kcl(self, inj_bus, f_or): + KCL_eq = [] 
+ for bus_id in range(self.nb_max_bus): + tmp = inj_bus[bus_id] + if np.any(self.bus_or.value == bus_id): + tmp += cp.sum(f_or[self.bus_or.value == bus_id]) + if np.any(self.bus_ex.value == bus_id): + tmp -= cp.sum(f_or[self.bus_ex.value == bus_id]) + KCL_eq.append(tmp) + return KCL_eq + + def _mask_theta_zero(self): + theta_is_zero = np.full(self.nb_max_bus, True, bool) + theta_is_zero[self.bus_or.value] = False + theta_is_zero[self.bus_ex.value] = False + theta_is_zero[self.bus_load.value] = False + theta_is_zero[self.bus_gen.value] = False + theta_is_zero[self.bus_storage.value] = False + theta_is_zero[0] = True # slack bus + return theta_is_zero + + def compute_optimum_unsafe(self): + # variables + theta = cp.Variable(shape=self.nb_max_bus) # at each bus + curtailment_mw = cp.Variable(shape=self.nb_max_bus) # at each bus + storage = cp.Variable(shape=self.nb_max_bus) # at each bus + redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus + + # usefull quantities + f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) + inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) + energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) + + KCL_eq = self._aux_compute_kcl(inj_bus, f_or) + theta_is_zero = self._mask_theta_zero() + + # constraints + constraints = ( # slack bus + [theta[theta_is_zero] == 0] + + + # KCL + [el == 0 for el in KCL_eq] + + + # limit redispatching to possible values + [redispatching <= self.redisp_up, redispatching >= -self.redisp_down] + + # limit curtailment + [curtailment_mw <= self.curtail_up, curtailment_mw >= -self.curtail_down] + + # limit storage + [storage <= self.storage_up, storage >= -self.storage_down] + + + # bus and generator variation should sum to 0. (not sure it's mandatory) + [energy_added == 0] + ) + + # objective + # cost = cp.norm1(gp_var) + cp.norm1(lp_var) + cost = ( self._penalty_curtailment * cp.sum_squares(curtailment_mw) + + self._penalty_storage * cp.sum_squares(storage) + + self._penalty_redispatching * cp.sum_squares(redispatching) + + cp.sum_squares(cp.pos(cp.abs(f_or) - self._margin_th_limit * self._th_lim_mw)) + ) + + # solve + prob = cp.Problem(cp.Minimize(cost), constraints) + has_converged = self._solve_problem(prob) + + if has_converged: + self.flow_computed[:] = f_or.value + res = (curtailment_mw.value, storage.value, redispatching.value) + else: + self.logger.error(f"Problem with the optimization for all tested solvers ({type(self).SOLVER_TYPES})") + self.flow_computed[:] = np.NaN + tmp_ = np.zeros(shape=self.nb_max_bus) + res = (1.0 * tmp_, 1.0 * tmp_, 1.0 * tmp_) + + return res + + def _solve_problem(self, prob, solver_type=None): + """ + try different solvers until one finds a good solution... + Not pretty at all... + """ + if solver_type is None: + for solver_type in type(self).SOLVER_TYPES: + res = self._solve_problem(prob, solver_type=solver_type) + if res: + self.logger.info(f"Solver {solver_type} has converged. 
Stopping there.") + return True + return False + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + tmp_ = prob.solve(solver=solver_type) + + if np.isfinite(tmp_): + return True + else: + self.logger.warning(f"Problem with the optimization for {solver_type}, infinite value") + raise cp.error.SolverError("Infinite value") + + except cp.error.SolverError as exc_: + self.logger.warning(f"Problem with the optimization for {solver_type}: {exc_}") + return False + + def _clean_vect(self, curtailment, storage, redispatching): + """remove the value too small and set them at 0.""" + curtailment[np.abs(curtailment) <= self.margin_sparse] = 0. + storage[np.abs(storage) <= self.margin_sparse] = 0. + redispatching[np.abs(redispatching) <= self.margin_sparse] = 0. + + def to_grid2op(self, + obs, + curtailment: np.ndarray, + storage: np.ndarray, + redispatching: np.ndarray) -> BaseAction: + self._clean_vect(curtailment, storage, redispatching) + + act = self.action_space() + + # storage + storage_ = np.zeros(shape=act.n_storage) + storage_[:] = storage[self.bus_storage.value] + # TODO what is multiple storage on a single bus ? + act.storage_p = storage_ + + # curtailment + # becarefull here, the curtailment is given by the optimizer + # in the amount of MW you remove, grid2op + # expects a maximum value + curtailment_ = np.zeros(shape=act.n_gen) -1. + gen_curt = obs.gen_renewable & (obs.gen_p > 0.1) + idx_gen = self.bus_gen.value[gen_curt] + tmp_ = curtailment[idx_gen] + modif_gen_optim = tmp_ != 0. + gen_p = 1.0 * obs.gen_p + aux_ = curtailment_[gen_curt] + aux_[modif_gen_optim] = (gen_p[gen_curt][modif_gen_optim] - + tmp_[modif_gen_optim] * + gen_p[gen_curt][modif_gen_optim] / + self.gen_per_bus.value[idx_gen][modif_gen_optim] + ) + aux_[~modif_gen_optim] = -1. + curtailment_[gen_curt] = aux_ + curtailment_[~gen_curt] = -1. + act.curtail_mw = curtailment_ + + # redispatching + redisp_ = np.zeros(obs.n_gen) + gen_redi = obs.gen_redispatchable & (obs.gen_p > 0.1) + idx_gen = self.bus_gen.value[gen_redi] + tmp_ = redispatching[idx_gen] + redisp_[gen_redi] = tmp_ * gen_p[gen_redi] / self.gen_per_bus.value[idx_gen] + redisp_[~gen_redi] = 0. + act.redispatch = redisp_ + return act + + def _update_constraints_param_safe(self, obs): + tmp_ = 1.0 * obs.gen_p + tmp_[~obs.gen_renewable] = 0. + + for bus_id in range(self.nb_max_bus): + # redispatching + self._add_redisp_const(obs, bus_id) + + # storage + self._add_storage_const(obs, bus_id) + + # curtailment + # self.curtail_down.value[bus_id] = 0. + # self.curtail_up.value[bus_id] = tmp_[(self.bus_gen.value == bus_id) & obs.gen_renewable].sum() + + self._remove_margin_rounding() + + def compute_optimum_safe(self, obs: BaseObservation, l_id=None): + if l_id is not None: + # TODO why reconnecting it on busbar 1 ? 
+ self.bus_ex.value[l_id] = obs.line_ex_to_subid[l_id] + self.bus_or.value[l_id] = obs.line_or_to_subid[l_id] + + # variables + theta = cp.Variable(shape=self.nb_max_bus) # at each bus + curtailment_mw = cp.Variable(shape=self.nb_max_bus) # at each bus + storage = cp.Variable(shape=self.nb_max_bus) # at each bus + redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus + + # stuff to put elsewhere (TODO) + past_dispatch = cp.Parameter(shape=self.nb_max_bus, + value=np.zeros(self.nb_max_bus) + ) # at each bus + for bus_id in range(self.nb_max_bus): + past_dispatch.value[bus_id] = obs.target_dispatch[self.bus_gen.value == bus_id].sum() + past_state_of_charge = cp.Parameter(shape=self.nb_max_bus, + value=np.zeros(self.nb_max_bus), + nonneg=True + ) # at each bus + for bus_id in range(self.nb_max_bus): + past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() + + # TODO put that in constructor with possibility to modify it ! + SoC = np.zeros(shape=self.nb_max_bus) + for bus_id in range(self.nb_max_bus): + SoC[bus_id] = 0.5 * obs.storage_Emax[self.bus_storage.value == bus_id].sum() + storage_target = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * SoC, + nonneg=True) + + # usefull quantities + f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) + inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) + energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) + + KCL_eq = self._aux_compute_kcl(inj_bus, f_or) + theta_is_zero = self._mask_theta_zero() + + dispatch_after_this = past_dispatch + redispatching + state_of_charge_after = past_state_of_charge + storage / (60. / obs.delta_time) + + # constraints + constraints = ( # slack bus + [theta[theta_is_zero] == 0] + + + # KCL + [el == 0 for el in KCL_eq] + + + # I impose here that the flows are bellow the limits + [f_or <= self._margin_th_limit * self._th_lim_mw] + + [f_or >= -self._margin_th_limit * self._th_lim_mw] + + + # limit redispatching to possible values + [redispatching <= self.redisp_up, redispatching >= -self.redisp_down] + + # limit curtailment + [curtailment_mw <= self.curtail_up, curtailment_mw >= -self.curtail_down] + + # limit storage + [storage <= self.storage_up, storage >= -self.storage_down] + + + # bus and generator variation should sum to 0. 
(not sure it's mandatory) + [energy_added == 0] + ) + + # TODO (in ctor) redisp_target + # TODO (in ctor) curtail_target + + # objective + # cost = cp.norm1(gp_var) + cp.norm1(lp_var) + cost = ( self._penalty_curtailment * cp.sum_squares(curtailment_mw) + + self._penalty_storage * cp.sum_squares(storage) + + self._penalty_redispatching * cp.sum_squares(redispatching) + + cp.sum_squares(dispatch_after_this) + + cp.sum_squares(state_of_charge_after - storage_target) + ) + + # solve + prob = cp.Problem(cp.Minimize(cost), constraints) + has_converged = self._solve_problem(prob) + + if has_converged: + self.flow_computed[:] = f_or.value + res = (curtailment_mw.value, storage.value, redispatching.value) + else: + self.logger.error(f"Problem with the optimization for all tested solvers ({type(self).SOLVER_TYPES})") + self.flow_computed[:] = np.NaN + tmp_ = np.zeros(shape=self.nb_max_bus) + res = (1.0 * tmp_, 1.0 * tmp_, 1.0 * tmp_) + + return res + + def act(self, + obs: BaseObservation, + reward: float, + done: bool) -> BaseAction: + + self.flow_computed[:] = np.NaN + if obs.rho.max() > self.rho_danger: + # I attempt to make the grid more secure + + # update the observation + self.update_parameters(obs) + # solve the problem + curtailment, storage, redispatching = self.compute_optimum_unsafe() + # get back the grid2op representation + act = self.to_grid2op(obs, curtailment, storage, redispatching) + elif obs.rho.max() < self.rho_safe: + # I attempt to get back to a more robust state (reconnect powerlines, + # storage state of charge close to the target state of charge, + # redispatching close to 0.0 etc.) + act = self.action_space() + + can_be_reco = (obs.time_before_cooldown_line == 0) & (~obs.line_status) + l_id = None + if np.any(can_be_reco): + # powerlines are not in cooldown + # I attempt to reconnect one of them (first one in the list) + l_id = np.where(can_be_reco)[0][0] + # TODO optimization to chose the "best" line to reconnect + act.line_set_status = [(l_id, +1)] + + # TODO + self.update_parameters(obs, unsafe=False) + curtailment, storage, redispatching = self.compute_optimum_safe(obs, l_id) + # get back the grid2op representation + act = self.to_grid2op(obs, curtailment, storage, redispatching) + else: + # I do nothing between rho_danger and rho_safe + act = self.action_space() + return act + +if __name__ == "__main__": + pass diff --git a/l2rpn_baselines/README.md b/l2rpn_baselines/README.md index bb55ebd..4eddd0c 100644 --- a/l2rpn_baselines/README.md +++ b/l2rpn_baselines/README.md @@ -1,16 +1,21 @@ # L2RPN Baselines -This package holds reference baselines for the [L2RPN challenge](https://l2rpn.chalearn.org/) +This package holds implementation baselines for the [L2RPN challenge](https://l2rpn.chalearn.org/) We thank kindly all baselines [contributors](../AUTHORS.txt). -*Disclaimer* All baselines shown in this code are used to serve as example. They are in no way optimal and none of them +*Disclaimer* All baselines shown in this code are used to serve as example, good practices or demonstrate some concepts. They are in no way optimal and none of them (to our knowledge) have been calibrated (learning rate is not tuned, neither is the number of layers, the size of each layers, the activation functions etc.) ## 1. 
Current available baselines +A list of top performers to some of the past L2RPN competitions can +be found in the documentation at https://l2rpn-baselines.readthedocs.io/en/latest/external_contributions.html + +In this package will find some other implementation (not tuned, to serve as examples): + - [Template](/l2rpn_baselines/Template): This a template baseline, provided as an example for contributors. @@ -18,47 +23,29 @@ of each layers, the activation functions etc.) - [DoNothing](/l2rpn_baselines/DoNothing): The most simple baseline, that takes no actions until it fails. + + - [ExpertAgent](/l2rpn_baselines/ExpertAgent) - - [DoubleDuelingDQN](/l2rpn_baselines/DoubleDuelingDQN): - - An example of a Double-DQN implementation. - - - [DoubleDuelingRDQN](/l2rpn_baselines/DoubleDuelingRDQN): - - An example of a Recurrent Deep-Q Network implementation. + An "expert" agent. It uses some expert knowledge about powergrid and graph theory to + take action facing when there are some overloads. - - [SliceRDQN](/l2rpn_baselines/SliceRDQN): + - [PPO_RLLIB](/l2rpn_baselines/PPO_RLLIB) - A multi Recurrent Q-streams implementation. - Where each action class has it's own Q network embedded in the global net. + Demonstrates how to use a PPO model (reinforcement learning model that achieved good + performances in some L2RPN competitions) with "ray / rllib" RL framework. - - [DeepQSimple](/l2rpn_baselines/DeepQSimple): + - [PPO_SB3](/l2rpn_baselines/PPO_SB3) - A simple implementation of the Deep Q Learning algorithm - - - [DuelQSimple](/l2rpn_baselines/DuelQSimple): + Demonstrates how to use a PPO model (reinforcement learning model that achieved good + performances in some L2RPN competitions) with "stable baselines 3" RL framework. - An alternative implementation to the Double DQN implementation. - - - [DuelQLeapNet](/l2rpn_baselines/DuelQLeapNet): + - [OptimCVXPY](/l2rpn_baselines/OptimCVXPY) - Another alternative implementation to the Double DQN implementation that uses the LeapNet see - [LeapNet](https://github.com/BDonnot/leap_net) as a way to model the Q-value. - - - [PandapowerOPFAgent](/l2rpn_baselines/PandapowerOPFAgent) - - A baseline thats uses an "Optimal Power Flow", a specific method develop by the power system community to - control the flows. - - - [Kaist](/l2rpn_baselines/Kaist) - - The winning agent of the WCCI 2020 competition based on graph neural networks and transformers. - - - [ExpertAgent](/l2rpn_baselines/ExpertAgent) + Shows how to use a optimization package (in this case cvxpy) to build an + agent proposing actions computed from this optimizer. Similar to the + "RL" baseline, for this one the "optimization modeling" is far from + state of the art and can be greatly improved. - An "expert" agent. It uses some expert knowledge about powergrid and graph theory to - take action facing when there are some overloads. - ## 2. How to? ### 2.a Use a baseline @@ -112,5 +99,3 @@ recommended that you post your baseline under one of the following license: - BSD clause 2 - BSD clause 3 - MPL v2.0 - - diff --git a/l2rpn_baselines/SACOld/evaluate.py b/l2rpn_baselines/SACOld/evaluate.py index 5e6c881..bd9ead5 100644 --- a/l2rpn_baselines/SACOld/evaluate.py +++ b/l2rpn_baselines/SACOld/evaluate.py @@ -38,6 +38,14 @@ def evaluate(env, Please use the new implementation instead. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). 
+ + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Parameters ---------- env: :class:`grid2op.Environment` diff --git a/l2rpn_baselines/SACOld/sacOld.py b/l2rpn_baselines/SACOld/sacOld.py index 90e8604..044da2e 100644 --- a/l2rpn_baselines/SACOld/sacOld.py +++ b/l2rpn_baselines/SACOld/sacOld.py @@ -15,6 +15,14 @@ class SACOld(DeepQAgent): """ Do not use this SACOld class that has lots of known (but forgotten) issues. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + .. warning:: We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. diff --git a/l2rpn_baselines/SACOld/sacOld_NN.py b/l2rpn_baselines/SACOld/sacOld_NN.py index 7d77a41..7f5fe88 100644 --- a/l2rpn_baselines/SACOld/sacOld_NN.py +++ b/l2rpn_baselines/SACOld/sacOld_NN.py @@ -32,6 +32,14 @@ class SACOld_NN(BaseDeepQ): """ Constructs the desired soft actor critic network. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + Compared to other baselines shown elsewhere (*eg* :class:`l2rpn_baselines.DeepQSimple` or :class:`l2rpn_baselines.DeepQSimple`) the implementation of the SAC is a bit more tricky (and was most likely NOT done properly in this class). For a more correct implementation diff --git a/l2rpn_baselines/SACOld/sacOld_NNParam.py b/l2rpn_baselines/SACOld/sacOld_NNParam.py index 6d7f675..66bf6fd 100644 --- a/l2rpn_baselines/SACOld/sacOld_NNParam.py +++ b/l2rpn_baselines/SACOld/sacOld_NNParam.py @@ -15,6 +15,14 @@ class SACOld_NNParam(NNParam): """ Do not use this SACOld class that has lots of known (but forgotten) issues. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + .. warning:: We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. diff --git a/l2rpn_baselines/SACOld/train.py b/l2rpn_baselines/SACOld/train.py index ab5523b..1153b53 100755 --- a/l2rpn_baselines/SACOld/train.py +++ b/l2rpn_baselines/SACOld/train.py @@ -33,6 +33,14 @@ def train(env, """ This function implements the "training" part of the baselines "SAC" (old buggy implementation). + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + .. warning:: We plan to add SAC based agents relying on external frameworks, such as stable baselines3 or ray / rllib. 
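As a quick orientation, here is a minimal usage sketch of the `OptimCVXPY` agent added above. It mirrors the setup used in the new `test_optimcvxpy.py` tests; the environment name, backend and threshold values (taken from those tests and from the constructor defaults) are illustrative only, not a tuned configuration::

    # Minimal usage sketch of the OptimCVXPY agent added in this patch.
    # cvxpy and lightsim2grid must be installed; values below are illustrative.
    import grid2op
    from grid2op.Action import PlayableAction
    from lightsim2grid import LightSimBackend
    from l2rpn_baselines.OptimCVXPY import OptimCVXPY

    env = grid2op.make("educ_case14_storage",
                       test=True,
                       action_class=PlayableAction,
                       backend=LightSimBackend())

    agent = OptimCVXPY(env.action_space,
                       env,
                       rho_danger=0.95,      # above this max. rho: "unsafe" optimization
                       rho_safe=0.85,        # below this max. rho: "safe" optimization
                       margin_th_limit=0.9)  # aim for flows at 90% of the thermal limits

    obs = env.reset()
    reward, done = 0.0, False
    while not done:
        # the agent returns a storage / curtailment / redispatching action
        act = agent.act(obs, reward, done)
        obs, reward, done, info = env.step(act)

Between `rho_safe` and `rho_danger` the agent deliberately does nothing, which prevents it from oscillating between the "safe" and "unsafe" optimization modes.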
diff --git a/l2rpn_baselines/SliceRDQN/evaluate.py b/l2rpn_baselines/SliceRDQN/evaluate.py index fdb7655..9560913 100755 --- a/l2rpn_baselines/SliceRDQN/evaluate.py +++ b/l2rpn_baselines/SliceRDQN/evaluate.py @@ -54,7 +54,15 @@ def evaluate(env, max_steps=DEFAULT_MAX_STEPS, verbose=DEFAULT_VERBOSE, save_gif=False): - + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + """ import tensorflow as tf # Limit gpu usage physical_devices = tf.config.list_physical_devices('GPU') diff --git a/l2rpn_baselines/SliceRDQN/sliceRDQN.py b/l2rpn_baselines/SliceRDQN/sliceRDQN.py index 827e25e..aefa61e 100644 --- a/l2rpn_baselines/SliceRDQN/sliceRDQN.py +++ b/l2rpn_baselines/SliceRDQN/sliceRDQN.py @@ -27,6 +27,16 @@ class SliceRDQN(AgentWithConverter): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, observation_space, action_space, diff --git a/l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py b/l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py index 2da6ed8..f6ae01d 100644 --- a/l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py +++ b/l2rpn_baselines/SliceRDQN/sliceRDQN_NN.py @@ -22,6 +22,16 @@ class SliceRDQN_NN(object): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + """ def __init__(self, action_size, observation_shape, diff --git a/l2rpn_baselines/SliceRDQN/train.py b/l2rpn_baselines/SliceRDQN/train.py index b6326c6..e3f31e4 100755 --- a/l2rpn_baselines/SliceRDQN/train.py +++ b/l2rpn_baselines/SliceRDQN/train.py @@ -75,6 +75,16 @@ def train(env, learning_rate=DEFAULT_LR, verbose=DEFAULT_VERBOSE): + """ + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. 
+ + """ import tensorflow as tf # Set config RDQNConfig.LR = learning_rate diff --git a/l2rpn_baselines/__init__.py b/l2rpn_baselines/__init__.py index 90cc3fc..735c0bf 100644 --- a/l2rpn_baselines/__init__.py +++ b/l2rpn_baselines/__init__.py @@ -8,16 +8,19 @@ all_baselines_li = [ "Template", - "DoubleDuelingDQN", - "DoubleDuelingRDQN", "DoNothing", "ExpertAgent", + "PPO_RLLIB", + "PPO_SB3", + "OptimCVXPY", + # Backward compatibility + "SACOld", + "DoubleDuelingDQN", + "DoubleDuelingRDQN", "SliceRDQN", "DeepQSimple", "DuelQSimple", "LeapNetEncoded", - # Backward compatibility - "SACOld", # contribution "PandapowerOPFAgent", "Geirina", diff --git a/l2rpn_baselines/test/test_optimcvxpy.py b/l2rpn_baselines/test/test_optimcvxpy.py new file mode 100644 index 0000000..6f79402 --- /dev/null +++ b/l2rpn_baselines/test/test_optimcvxpy.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import unittest +import warnings +import grid2op +from lightsim2grid import LightSimBackend +from grid2op.Action import PlayableAction +from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY +from grid2op.Parameters import Parameters + +import pdb + +class TestOptimCVXPY(unittest.TestCase): + def _aux_check_type(self, act, line_status=False, redisp=True): + # return + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + assert not injection + assert not voltage + assert not topology + if line_status: + assert line + else: + assert not line + if redisp: + assert redispatching + else: + assert not redispatching + assert storage + assert curtailment + + def _aux_create_env_setup(self, param=None): + if param is None: + param = Parameters() + param.NO_OVERFLOW_DISCONNECTION = True + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("educ_case14_storage", + backend=LightSimBackend(), + action_class=PlayableAction, + param=param, + test=True) + env.set_id(2) + env.seed(0) + env.reset() + env.fast_forward_chronics(215) + return env + + def test_unsafe(self): + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85) + + obs, reward, done, info = env.step(env.action_space()) + # max rhos of the 3 following step if I do nothing + max_rhos = [1.0063555, 1.0104821, 1.0110041] + + act = agent.act(obs, None, None) + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 1.0" + assert obs.rho.max() < max_rhos[0], f"{obs.rho.max()} >= {max_rhos[0]}" + + act = agent.act(obs, None, None) + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 1.0" + assert obs.rho.max() < max_rhos[1], f"{obs.rho.max()} >= {max_rhos[1]}" + + act = agent.act(obs, None, None) + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 
1.0" + assert obs.rho.max() < max_rhos[2], f"{obs.rho.max()} >= {max_rhos[2]}" + + def test_unsafe_linedisc(self): + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85) + + l_id_disc = 4 + obs, reward, done, info = env.step(env.action_space({"set_line_status": [(l_id_disc, -1)]})) + assert not done + assert obs.rho[l_id_disc] <= 1e-6, f"{obs.rho[l_id_disc]} > 1e-6" + + # max rhos of the 3 following step if I do nothing + max_rhos = [1.006486, 1.0111672, 1.0115097] + + act = agent.act(obs, None, None) + assert agent.flow_computed[l_id_disc] <= 1e-6, f"{agent.flow_computed[l_id_disc]} > 1e-6" + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 1.0" + assert obs.rho.max() < max_rhos[0], f"{obs.rho.max()} >= {max_rhos[0]}" + + act = agent.act(obs, None, None) + assert agent.flow_computed[l_id_disc] <= 1e-6, f"{agent.flow_computed[l_id_disc]} > 1e-6" + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 1.0" + assert obs.rho.max() < max_rhos[1], f"{obs.rho.max()} >= {max_rhos[1]}" + + act = agent.act(obs, None, None) + assert agent.flow_computed[l_id_disc] <= 1e-6, f"{agent.flow_computed[l_id_disc]} > 1e-6" + self._aux_check_type(act) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert obs.rho.max() < 1.0, f"{obs.rho.max()} >= 1.0" + assert obs.rho.max() < max_rhos[2], f"{obs.rho.max()} >= {max_rhos[2]}" + + def test_safe_do_reco(self): + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, + env, + rho_safe=9.5, + rho_danger=10., + margin_th_limit=0.9) + + l_id_disc = 4 + obs, reward, done, info = env.step(env.action_space({"set_line_status": [(l_id_disc, -1)]})) + assert not done + act = agent.act(obs, None, None) + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + assert line + + def test_safe_dont_reco_cooldown(self): + param = Parameters() + param.NB_TIMESTEP_COOLDOWN_LINE = 3 + param.NO_OVERFLOW_DISCONNECTION = True + env = self._aux_create_env_setup(param=param) + agent = OptimCVXPY(env.action_space, + env, + rho_safe=9.5, + rho_danger=10., + margin_th_limit=0.9) + + l_id_disc = 4 + # a cooldown applies, agent does not reconnect it + obs, reward, done, info = env.step(env.action_space({"set_line_status": [(l_id_disc, -1)]})) + assert not done + act = agent.act(obs, None, None) + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + assert not line + # still a cooldown + obs, reward, done, info = env.step(env.action_space()) + assert not done + act = agent.act(obs, None, None) + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + assert not line + # still a cooldown + obs, reward, done, info = env.step(env.action_space()) + assert not done + act = agent.act(obs, None, None) + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + assert not line + + # no more cooldown, it should reconnect it + obs, reward, done, info = env.step(env.action_space()) + assert not done + act = agent.act(obs, None, None) + types = act.get_types() + injection, voltage, topology, line, redispatching, storage, curtailment = types + 
assert line + + def test_safe_setback_redisp(self): + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, + env, + rho_safe=9.5, + rho_danger=10., + margin_th_limit=10.0) + act_prev = env.action_space() + act_prev.redispatch = [3.0, 4.0, 0.0, 0.0, 0.0, -7.0] + obs, reward, done, info = env.step(act_prev) + assert not done + act = agent.act(obs, None, None) + pdb.set_trace() + print(act) +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/l2rpn_baselines/utils/baseDeepQ.py b/l2rpn_baselines/utils/baseDeepQ.py index 043d7c6..a44f104 100644 --- a/l2rpn_baselines/utils/baseDeepQ.py +++ b/l2rpn_baselines/utils/baseDeepQ.py @@ -29,6 +29,18 @@ class BaseDeepQ(ABC): This class aims at representing the Q value (or more in case of SAC) parametrization by a neural network. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` + classes to train agent interacting with grid2op and fully compatible + with gym framework. + It is composed of 2 different networks: - model: which is the main model diff --git a/l2rpn_baselines/utils/deepQAgent.py b/l2rpn_baselines/utils/deepQAgent.py index 679e6b9..4adabf3 100644 --- a/l2rpn_baselines/utils/deepQAgent.py +++ b/l2rpn_baselines/utils/deepQAgent.py @@ -34,6 +34,18 @@ class DeepQAgent(AgentWithConverter): """ This class allows to train and log the training of different Q learning algorithm. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` + classes to train agent interacting with grid2op and fully compatible + with gym framework. + It is not meant to be the state of the art implement of some baseline. It is rather meant to be a set of useful functions that allows to easily develop an environment if we want to get started in RL using grid2op. diff --git a/l2rpn_baselines/utils/nnParam.py b/l2rpn_baselines/utils/nnParam.py index 6a7c762..cb74f55 100644 --- a/l2rpn_baselines/utils/nnParam.py +++ b/l2rpn_baselines/utils/nnParam.py @@ -21,6 +21,18 @@ class NNParam(object): It is recommended to overload this class for each specific model. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` + classes to train agent interacting with grid2op and fully compatible + with gym framework. 
+ Attributes ---------- diff --git a/l2rpn_baselines/utils/rlAgent.py b/l2rpn_baselines/utils/rlAgent.py deleted file mode 100644 index 044a91b..0000000 --- a/l2rpn_baselines/utils/rlAgent.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020, RTE (https://www.rte-france.com) -# See AUTHORS.txt -# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. -# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, -# you can obtain one at http://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. - -import os -import numpy as np - -from grid2op.Agent import BaseAgent -from l2rpn_baselines.utils.trainingParam import TrainingParam - - -class RLAgent(BaseAgent): - def __init__(self, name): - raise NotImplementedError() - self.name = name - - self._training_param = None - self._tf_writer = None - - self.brain = None # the "stuff" that takes the decisions - - self._max_iter_env_ = 1000000 - self._curr_iter_env = 0 - self._max_reward = 0. - - # action type - self.nb_injection = 0 - self.nb_voltage = 0 - self.nb_topology = 0 - self.nb_line = 0 - self.nb_redispatching = 0 - self.nb_do_nothing = 0 - - # for over sampling the hard scenarios - self._prev_obs_num = 0 - self._time_step_lived = None - self._nb_chosen = None - self._proba = None - self._prev_id = 0 - # this is for the "limit the episode length" depending on your previous success - self._total_sucesses = 0 - - # BaseAgent interface - def act(self, obs, reward, done=False): - act = self.brain.predict(obs, reward, done, train=False) - return act - - # Baseline interface - def load(self, path): - """ - Part of the l2rpn_baselines interface, this function allows to read back a trained model, to continue the - training or to evaluate its performance for example. - - **NB** To reload an agent, it must have exactly the same name and have been saved at the right location. - - Parameters - ---------- - path: ``str`` - The path where the agent has previously beens saved. - - """ - # not modified compare to original implementation - tmp_me = os.path.join(path, self.name) - if not os.path.exists(tmp_me): - raise RuntimeError("The model should be stored in \"{}\". But this appears to be empty".format(tmp_me)) - self._load_action_space(tmp_me) - - # TODO handle case where training param class has been overidden - self._training_param = TrainingParam.from_json(os.path.join(tmp_me, "training_params.json".format(self.name))) - self.deep_q = self._nn_archi.make_nn(self._training_param) - try: - self.deep_q.load_network(tmp_me, name=self.name) - except Exception as e: - raise RuntimeError("Impossible to load the model located at \"{}\" with error \n{}".format(path, e)) - - for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]: - conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr)) - if os.path.exists(conv_path): - setattr(self, nm_attr, np.load(file=conv_path)) diff --git a/l2rpn_baselines/utils/train_generic.py b/l2rpn_baselines/utils/train_generic.py index 9e7a1ea..e0b3229 100644 --- a/l2rpn_baselines/utils/train_generic.py +++ b/l2rpn_baselines/utils/train_generic.py @@ -17,6 +17,18 @@ def train_generic(agent, """ This function is a helper to train more easily some agent using their default "train" method. + .. warning:: + This baseline recodes entire the RL training procedure. 
You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` + classes to train agent interacting with grid2op and fully compatible + with gym framework. + Parameters ---------- agent: :class:`grid2op.Agent` diff --git a/l2rpn_baselines/utils/trainingParam.py b/l2rpn_baselines/utils/trainingParam.py index 22fdd95..722904e 100644 --- a/l2rpn_baselines/utils/trainingParam.py +++ b/l2rpn_baselines/utils/trainingParam.py @@ -15,6 +15,18 @@ class TrainingParam(object): A class to store the training parameters of the models. It was hard coded in the getting_started/notebook 3 of grid2op and put in this repository instead. + .. warning:: + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of "PPO_RLLIB" + or the "PPO_SB3" baseline. + + Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` + classes to train agent interacting with grid2op and fully compatible + with gym framework. + Attributes ---------- buffer_size: ``int`` diff --git a/setup.py b/setup.py index 254fd5d..fd53577 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ "sphinx>=2.4.4", "sphinx-rtd-theme>=0.4.3", "sphinxcontrib-trio>=1.1.0", - "autodocsumm>=0.2.7" + "autodocsumm>=0.2.7", + "cvxpy" ], "optional": ["grid2op[optional]>=1.6.5", "tensorflow>=2.2.0", @@ -36,7 +37,7 @@ "PPO_RLLIB": ["ray[rllib]", "jsonpickle", "lightsim2grid"], - "PPO_SB3": ["stable_baselines3"] + "PPO_SB3": ["stable_baselines3", "lightsim2grid"] } } From fc25ff5b84217808a1303c5856d0b809a6e52f47 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Wed, 4 May 2022 15:28:30 +0200 Subject: [PATCH 38/56] adding the feature to change storage charge setpoint in the optimcvxpy --- .gitignore | 1 + docs/deepqsimple.rst | 12 +- docs/doubleduelingdqn.rst | 12 +- docs/doubleduelingrdqn.rst | 12 +- docs/duelqleapnet.rst | 12 +- docs/duelqsimple.rst | 12 +- docs/index.rst | 36 ++- docs/leapnetencoded.rst | 12 +- docs/ppo_rllib.rst | 6 +- docs/ppo_stable_baselines.rst | 2 +- .../ppo_stable_baselines/B_train_agent.py | 11 +- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 214 ++++++++++++------ l2rpn_baselines/test/test_optimcvxpy.py | 69 +++++- 13 files changed, 271 insertions(+), 140 deletions(-) diff --git a/.gitignore b/.gitignore index c835312..9481b54 100644 --- a/.gitignore +++ b/.gitignore @@ -190,3 +190,4 @@ line_act.json tensorboard/ test_sac/ documentation/ +test_issue_glop.py diff --git a/docs/deepqsimple.rst b/docs/deepqsimple.rst index 096e0fd..3edeb4c 100644 --- a/docs/deepqsimple.rst +++ b/docs/deepqsimple.rst @@ -11,12 +11,12 @@ baseline. Don't expect to obtain state of the art method with this simple method An example to train this model is available in the train function :ref:`Example-deepqsimple` .. warning:: - This baseline recodes entire the RL training procedure. You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. 
+ This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Exported class -------------- diff --git a/docs/doubleduelingdqn.rst b/docs/doubleduelingdqn.rst index 4df7c0d..fa1d146 100644 --- a/docs/doubleduelingdqn.rst +++ b/docs/doubleduelingdqn.rst @@ -11,12 +11,12 @@ This baseline is of type Double Duelling Deep Q Network, as in Duelling Q Networ It's main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state of the art results. .. warning:: - This baseline recodes entire the RL training procedure. You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Agent class ------------------------ diff --git a/docs/doubleduelingrdqn.rst b/docs/doubleduelingrdqn.rst index 1b9c820..50cf7cb 100644 --- a/docs/doubleduelingrdqn.rst +++ b/docs/doubleduelingrdqn.rst @@ -11,12 +11,12 @@ This baseline is of type Recurrent Double Duelling Deep Q Network, as in Duellin It's main purpose is to provide an example of this network type running with Grid2Op. However, don't expect to obtain state of the art results. .. warning:: - This baseline recodes entire the RL training procedure. You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Agent class ------------------------ diff --git a/docs/duelqleapnet.rst b/docs/duelqleapnet.rst index 3a8f88f..0c77085 100644 --- a/docs/duelqleapnet.rst +++ b/docs/duelqleapnet.rst @@ -18,12 +18,12 @@ In this baseline, we use this very same architecture to model the Q function. Th An example to train this model is available in the train function :ref:`Example-leapnet`. .. warning:: - This baseline recodes entire the RL training procedure. You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). 
+ + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Exported class -------------- diff --git a/docs/duelqsimple.rst b/docs/duelqsimple.rst index 175dcfa..a34cd49 100644 --- a/docs/duelqsimple.rst +++ b/docs/duelqsimple.rst @@ -13,12 +13,12 @@ baseline. Don't expect to obtain state of the art method with this simple method An example to train this model is available in the train function :ref:`Example-duelqsimple`. .. warning:: - This baseline recodes entire the RL training procedure. You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Exported class -------------- diff --git a/docs/index.rst b/docs/index.rst index dd7c839..3d820f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,8 +17,8 @@ are already implemented in this package. template donothing -Open source libraries ----------------------- +Some RL implementation examples +--------------------------------- Lots of reinforcement learning algorithms are already implemented by state of the art libraries heavily maintained and updated. @@ -32,9 +32,11 @@ learning to the power grid control problem. ppo_rllib ppo_stable_baselines + external_contributions + -Other contributions ---------------------- +Expert systems and optimizers +------------------------------ In this section, we grouped up some noticeable contributions for the powergrid control problem. @@ -49,13 +51,19 @@ in some environment. expertagent optimcvxpy - external_contributions -Custom implementation + +Possible implementation --------------------------- .. note:: - WORK IN PROGRESS + Most of the codes below are legacy code that will not be updated and contains + (most likely) lots of bugs, inefficiencies and "not so great" code. + + It's totally fine to use them if you want to dive deep into implementation. + For most usage however, we strongly encourage you to check out the + :class:`l2rpn_baselines.PPO_SB3.PPO_SB3` or the + :class:`l2rpn_baselines.PPO_RLLIB.PPO_RLLIB`. For more "in depth" look at what is possible to do, we also wrote some custom implementation of some reinforcement learning algorithms. @@ -75,20 +83,6 @@ using grid2op more closely that through the gym interface. duelqleapnet doubleduelingrdqn leapnetencoded - - -Deprecated baselines ---------------------------- - -.. warning:: - These are "deprecated", won't be fixed / maintained and are not likely to work. - -This section is mainly here for "history". - -.. toctree:: - :maxdepth: 1 - :caption: Deprecated baselines - sacold Indices and tables diff --git a/docs/leapnetencoded.rst b/docs/leapnetencoded.rst index ea8a9a2..c617216 100644 --- a/docs/leapnetencoded.rst +++ b/docs/leapnetencoded.rst @@ -22,12 +22,12 @@ a leap net) that parametrized the Q function. An example to train this model is available in the train function :ref:`Example-leapnetenc`. .. warning:: - This baseline recodes entire the RL training procedure. 
You can use it if you - want to have a deeper look at Deep Q Learning algorithm and a possible (non - optimized, slow, etc. implementation ). - - For a much better implementation, you can reuse the code of "PPO_RLLIB" - or the "PPO_SB3" baseline. + This baseline recodes entire the RL training procedure. You can use it if you + want to have a deeper look at Deep Q Learning algorithm and a possible (non + optimized, slow, etc. implementation ). + + For a much better implementation, you can reuse the code of :class:`l2rpn_baselines.PPO_RLLIB` + or the :class:`l2rpn_baselines.PPO_SB3` baseline. Exported class -------------- diff --git a/docs/ppo_rllib.rst b/docs/ppo_rllib.rst index 725f720..9641de6 100644 --- a/docs/ppo_rllib.rst +++ b/docs/ppo_rllib.rst @@ -1,4 +1,4 @@ -.. currentmodule:: l2rpn_baselines.ppo_stablebaselines +.. currentmodule:: l2rpn_baselines.PPO_RLLIB PPO: with ray/rllib =========================================================== @@ -15,7 +15,7 @@ by applying `redispatching` kind of action for controlable generators or by with `curtailment` on generator using new renewable energy sources - solar and wind or even to control the state of the storage units.) -It is pretty much the same as the :class:`l2rpn_baselines.PPO_SB3` but uses +It is pretty much the same as the :class:`l2rpn_baselines.PPO_RLLIB` but uses rllib instead of stable Baselines3. Exported class @@ -194,6 +194,6 @@ For example, to create an agent **from scratch**, with some parameters: Detailed documentation ++++++++++++++++++++++++ -.. automodule:: l2rpn_baselines.PPO_SB3 +.. automodule:: l2rpn_baselines.PPO_RLLIB :members: :autosummary: diff --git a/docs/ppo_stable_baselines.rst b/docs/ppo_stable_baselines.rst index 3a48cb6..74001a6 100644 --- a/docs/ppo_stable_baselines.rst +++ b/docs/ppo_stable_baselines.rst @@ -1,4 +1,4 @@ -.. currentmodule:: l2rpn_baselines.ppo_stablebaselines +.. currentmodule:: l2rpn_baselines.PPO_SB3 PPO: with stable-baselines3 =========================================================== diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 06bdd6b..3c82921 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -20,8 +20,9 @@ env_name = "l2rpn_icaps_2021_small_train" env_name = "l2rpn_wcci_2022_dev_train" +env_name = "wcci_2022_dev_2" save_path = "./saved_model" -name = "expe_GymEnvWithRecoWithDN_2022_test4" +name = "expe_GymEnvWithRecoWithDN_2022_test5" gymenv_class = GymEnvWithRecoWithDN # uses the heuristic to do nothing is the grid is not at risk and to reconnect powerline automatically max_iter = 7 * 24 * 12 # None to deactivate it safe_max_rho = 0.9 # the grid is said "safe" if the rho is lower than this value, it is a really important parameter to tune ! 
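A rough sketch of what the heuristic mentioned above does, for readers who do not want to dive into `GymEnvWithRecoWithDN` (illustration only, not the actual implementation; the function name and the `agent_act_fun` callback are made up for the example): reconnect powerlines whenever possible, do nothing while the grid is safe, and only query the trained policy when some `obs.rho` exceeds `safe_max_rho`.

.. code-block:: python

    import numpy as np

    def heuristic_or_agent_action(obs, action_space, agent_act_fun, safe_max_rho=0.9):
        # reconnect a powerline as soon as its cooldown allows it
        can_be_reco = (~obs.line_status) & (obs.time_before_cooldown_line == 0)
        if np.any(can_be_reco):
            l_id = np.where(can_be_reco)[0][0]
            return action_space({"set_line_status": [(l_id, +1)]})
        # grid considered "safe": do nothing, the trained policy is not even queried
        if obs.rho.max() < safe_max_rho:
            return action_space()
        # grid at risk: delegate to the trained policy
        return agent_act_fun(obs)

The lower `safe_max_rho` is, the more often the trained policy is actually exercised (and trained on) during the episodes, which is why it is flagged as an important parameter to tune.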
@@ -135,12 +136,18 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): reward_class=CustomReward, backend=LightSimBackend(), chronics_class=MultifolderWithCache) + param = env.parameters + param.LIMIT_INFEASIBLE_CURTAILMENT_STORAGE_ACTION = True + env.change_parameters(param) + if max_iter is not None: env.set_max_iter(max_iter) # one week obs = env.reset() # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) - env.chronics_handler.real_data.set_filter(lambda x: True) + # env.chronics_handler.real_data.set_filter(lambda x: True) + env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-08-01_.*$", x) is not None) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index a8cb47f..d6c6c06 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -9,6 +9,7 @@ from typing import Optional import logging import warnings +from attr import has import cvxpy as cp import numpy as np @@ -81,11 +82,17 @@ class OptimCVXPY(BaseAgent): nb_max_bus: `int` Maximum number of buses allowed in the powergrid. - _penalty_curtailment: ` cp.Parameter` + _penalty_curtailment_unsafe: ` cp.Parameter` - _penalty_redispatching: `cp.Parameter` + _penalty_redispatching_unsafe: `cp.Parameter` - _penalty_storag: `cp.Parameter` + _penalty_storage_unsafe: `cp.Parameter` + + _penalty_curtailment_safe: ` cp.Parameter` + + _penalty_redispatching_safe: `cp.Parameter` + + _penalty_storage_safe: `cp.Parameter` bus_or: `cp.Parameter` @@ -125,7 +132,9 @@ class OptimCVXPY(BaseAgent): A logger to log information about the optimization process. """ - SOLVER_TYPES = [cp.SCS, cp.OSQP, cp.SCIPY] + SOLVER_TYPES = [cp.OSQP, cp.SCS, cp.SCIPY] + # NB: SCIPY rarely converge + # SCS converged almost all the time, but is inaccurate def __init__(self, action_space : ActionSpace, @@ -134,9 +143,14 @@ def __init__(self, margin_th_limit: float=0.9, rho_danger: float=0.95, rho_safe: float=0.85, - penalty_curtailment: float=0.1, - penalty_redispatching: float=0.03, - penalty_storage: float=0.3, + penalty_curtailment_unsafe: float=0.1, + penalty_redispatching_unsafe: float=0.03, + penalty_storage_unsafe: float=0.3, + penalty_curtailment_safe: float=0.0, + penalty_redispatching_safe: float=0.00, + weight_redisp_target: float=1.0, + weight_storage_target: float=1.0, + penalty_storage_safe: float=0.0, margin_rounding: float=0.01, margin_sparse: float=1e-4, logger : Optional[logging.Logger]=None) -> None: @@ -177,21 +191,40 @@ def __init__(self, "safe grid" optimization routine and try to set back the grid into a reference state. - penalty_curtailment: `float` - The cost of applying a curtailment in the objective function. + penalty_curtailment_unsafe: `float` + The cost of applying a curtailment in the objective function. Applies only in "unsafe" mode. + + Default value is 0.1, should be >= 0. + + penalty_redispatching_unsafe: `float` + The cost of applying a redispatching in the objective function. Applies only in "unsafe" mode. + + Default value is 0.03, should be >= 0. 
+ + penalty_storage_unsafe: `float` + The cost of applying a storage in the objective function. Applies only in "unsafe" mode. + + Default value is 0.3, should be >= 0. + + penalty_curtailment_safe: `float` + The cost of applying a curtailment in the objective function. Applies only in "safe" mode. - Default value is 0.1. + Default value is 0.0, should be >= 0. - penalty_redispatching: `float` - The cost of applying a redispatching in the objective function. + penalty_redispatching_unsafe: `float` + The cost of applying a redispatching in the objective function. Applies only in "safe" mode. - Default value is 0.03. + Default value is 0.0, should be >= 0. - penalty_storage: `float` - The cost of applying a storage in the objective function. + penalty_storage_unsafe: `float` + The cost of applying a storage in the objective function. Applies only in "safe" mode. - Default value is 0.3. + Default value is 0.0, should be >= 0. + weight_storage_target: `float` + + weight_redisp_target: `float` + margin_rounding: `float` A margin taken to avoid rounding issues that could lead to infeasible actions due to "redispatching above max_ramp_up" for example. @@ -217,12 +250,35 @@ def __init__(self, BaseAgent.__init__(self, action_space) self._margin_th_limit: cp.Parameter = cp.Parameter(value=margin_th_limit, nonneg=True) - self._penalty_curtailment: cp.Parameter = cp.Parameter(value=penalty_curtailment, - nonneg=True) - self._penalty_redispatching: cp.Parameter = cp.Parameter(value=penalty_redispatching, - nonneg=True) - self._penalty_storage: cp.Parameter = cp.Parameter(value=penalty_storage, - nonneg=True) + self._penalty_curtailment_unsafe: cp.Parameter = cp.Parameter(value=penalty_curtailment_unsafe, + nonneg=True) + self._penalty_redispatching_unsafe: cp.Parameter = cp.Parameter(value=penalty_redispatching_unsafe, + nonneg=True) + self._penalty_storage_unsafe: cp.Parameter = cp.Parameter(value=penalty_storage_unsafe, + nonneg=True) + + self._penalty_curtailment_safe: cp.Parameter = cp.Parameter(value=penalty_curtailment_safe, + nonneg=True) + self._penalty_redispatching_safe: cp.Parameter = cp.Parameter(value=penalty_redispatching_safe, + nonneg=True) + self._penalty_storage_safe: cp.Parameter = cp.Parameter(value=penalty_storage_safe, + nonneg=True) + + self._weight_redisp_target: cp.Parameter = cp.Parameter(value=weight_redisp_target, + nonneg=True) + self._weight_storage_target: cp.Parameter = cp.Parameter(value=weight_storage_target, + nonneg=True) + + + self.nb_max_bus: int = 2 * env.n_sub + + SoC = np.zeros(shape=self.nb_max_bus) + self._storage_setpoint: np.ndarray = 0.5 * env.storage_Emax + for bus_id in range(self.nb_max_bus): + SoC[bus_id] = 0.5 * self._storage_setpoint[env.storage_to_subid == bus_id].sum() + self._storage_target_bus = cp.Parameter(shape=self.nb_max_bus, + value=1.0 * SoC, + nonneg=True) self.margin_rounding: float = float(margin_rounding) self.margin_sparse: float = float(margin_sparse) @@ -262,7 +318,6 @@ def __init__(self, # TODO replace all below with sparse matrices # to be able to change the topology more easily - self.nb_max_bus: int = 2 * env.n_sub self.bus_or: cp.Parameter = cp.Parameter(shape=env.n_line, value=env.line_or_to_subid, integer=True) @@ -312,6 +367,16 @@ def __init__(self, value=env.get_thermal_limit(), nonneg=True) + self._past_dispatch = cp.Parameter(shape=self.nb_max_bus, + value=np.zeros(self.nb_max_bus) + ) + self._past_state_of_charge = cp.Parameter(shape=self.nb_max_bus, + value=np.zeros(self.nb_max_bus), + nonneg=True + ) + + self._v_ref: 
np.ndarray = 1.0 * env.get_obs().v_or + if logger is None: self.logger: logging.Logger = logging.getLogger(__name__) self.logger.disabled = False @@ -331,27 +396,36 @@ def margin_th_limit(self, val: float): @property def penalty_curtailment(self) -> cp.Parameter: - return self._penalty_curtailment + return self._penalty_curtailment_unsafe @penalty_curtailment.setter def penalty_curtailment(self, val: float): - self._penalty_curtailment = float(val) + self._penalty_curtailment_unsafe = float(val) @property def penalty_redispatching(self) -> cp.Parameter: - return self._penalty_redispatching + return self._penalty_redispatching_unsafe @penalty_redispatching.setter def penalty_redispatching(self, val: float): - self._penalty_redispatching = float(val) + self._penalty_redispatching_unsafe = float(val) @property def penalty_storage(self) -> cp.Parameter: - return self._penalty_storage + return self._penalty_storage_unsafe @penalty_storage.setter def penalty_storage(self, val: float): - self._penalty_storage = float(val) + self._penalty_storage_unsafe = float(val) + + @property + def storage_setpoint(self) -> cp.Parameter: + return self._storage_setpoint + + @storage_setpoint.setter + def storage_setpoint(self, val: np.ndarray): + self._storage_setpoint.value[:] = np.array(val).astype(float) + def _update_topo_param(self, obs: BaseObservation): tmp_ = 1 * obs.line_or_to_subid @@ -379,9 +453,15 @@ def _update_topo_param(self, obs: BaseObservation): self.bus_storage.value[:] = tmp_ def _update_th_lim_param(self, obs: BaseObservation): + # take into account reactive value (and current voltage) in thermal limit self._th_lim_mw.value[:] = (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 self._th_lim_mw.value[:] = np.sqrt(self._th_lim_mw.value) - # TODO what if it's negative ! + + # do whatever you can for disconnected lines + index_disc = obs.v_or == 0. + self._th_lim_mw.value[index_disc] = 0.001 * (obs.thermal_limit * self._v_ref )[index_disc] * np.sqrt(3.) + + # TODO what if (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 is negative ! def _update_inj_param(self, obs: BaseObservation): self.load_per_bus.value[:] = 0. 
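The `_update_th_lim_param` hunk above turns the thermal limit (an ampere rating) into an active-power limit in MW, using the observed voltage and reactive flow, and falls back on the stored reference voltages `_v_ref` for disconnected lines (where `obs.v_or` is 0). A small numeric illustration of that formula, with made-up values:

.. code-block:: python

    import numpy as np

    i_max_a = 400.    # thermal limit of the line, in A (obs.thermal_limit)
    v_or_kv = 138.    # voltage magnitude at the origin side, in kV (obs.v_or)
    q_or_mvar = 20.   # reactive flow at the origin side, in MVAr (obs.q_or)

    # apparent power rating: 3-phase S = sqrt(3) * V [kV] * I [kA], in MVA
    s_max_mva = np.sqrt(3. * (0.001 * i_max_a) ** 2 * v_or_kv ** 2)
    # active power left once the reactive flow is accounted for: ~93.5 MW here
    p_max_mw = np.sqrt(s_max_mva ** 2 - q_or_mvar ** 2)

As the remaining TODO notes, the expression under the square root becomes negative when the reactive flow alone exceeds the apparent-power rating; that corner case is not handled yet.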
@@ -432,7 +512,7 @@ def _update_constraints_param_unsafe(self, obs: BaseObservation): self._remove_margin_rounding() - def _remove_margin_rounding(self): + def _remove_margin_rounding(self): self.storage_down.value[self.storage_down.value > self.margin_rounding] -= self.margin_rounding self.storage_up.value[self.storage_up.value > self.margin_rounding] -= self.margin_rounding self.curtail_down.value[self.curtail_down.value > self.margin_rounding] -= self.margin_rounding @@ -448,6 +528,9 @@ def _validate_param_values(self): self.redisp_up._validate_value(self.redisp_up.value) self.redisp_down._validate_value(self.redisp_down.value) self._th_lim_mw._validate_value(self._th_lim_mw.value) + self._storage_target_bus._validate_value(self._storage_target_bus.value) + self._past_dispatch._validate_value(self._past_dispatch.value) + self._past_state_of_charge._validate_value(self._past_state_of_charge.value) def update_parameters(self, obs: BaseObservation, unsafe: bool = True): ## update the topology information @@ -525,9 +608,9 @@ def compute_optimum_unsafe(self): # objective # cost = cp.norm1(gp_var) + cp.norm1(lp_var) - cost = ( self._penalty_curtailment * cp.sum_squares(curtailment_mw) + - self._penalty_storage * cp.sum_squares(storage) + - self._penalty_redispatching * cp.sum_squares(redispatching) + + cost = ( self._penalty_curtailment_unsafe * cp.sum_squares(curtailment_mw) + + self._penalty_storage_unsafe * cp.sum_squares(storage) + + self._penalty_redispatching_unsafe * cp.sum_squares(redispatching) + cp.sum_squares(cp.pos(cp.abs(f_or) - self._margin_th_limit * self._th_lim_mw)) ) @@ -562,12 +645,12 @@ def _solve_problem(self, prob, solver_type=None): try: with warnings.catch_warnings(): warnings.filterwarnings("ignore") - tmp_ = prob.solve(solver=solver_type) + tmp_ = prob.solve(solver=solver_type, warm_start=False) # prevent warm start (for now) if np.isfinite(tmp_): return True else: - self.logger.warning(f"Problem with the optimization for {solver_type}, infinite value") + self.logger.warning(f"Problem with the optimization for {solver_type}, infinite value returned") raise cp.error.SolverError("Infinite value") except cp.error.SolverError as exc_: @@ -584,10 +667,12 @@ def to_grid2op(self, obs, curtailment: np.ndarray, storage: np.ndarray, - redispatching: np.ndarray) -> BaseAction: + redispatching: np.ndarray, + act=None) -> BaseAction: self._clean_vect(curtailment, storage, redispatching) - act = self.action_space() + if act is None: + act = self.action_space() # storage storage_ = np.zeros(shape=act.n_storage) @@ -629,7 +714,6 @@ def to_grid2op(self, def _update_constraints_param_safe(self, obs): tmp_ = 1.0 * obs.gen_p tmp_[~obs.gen_renewable] = 0. - for bus_id in range(self.nb_max_bus): # redispatching self._add_redisp_const(obs, bus_id) @@ -640,7 +724,17 @@ def _update_constraints_param_safe(self, obs): # curtailment # self.curtail_down.value[bus_id] = 0. # self.curtail_up.value[bus_id] = tmp_[(self.bus_gen.value == bus_id) & obs.gen_renewable].sum() + + # storage target + self._storage_target_bus.value[bus_id] = self._storage_setpoint[self.bus_storage.value == bus_id].sum() + + # past information + self._past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() + self._past_dispatch.value[bus_id] = obs.target_dispatch[self.bus_gen.value == bus_id].sum() + self.curtail_down.value[:] = 0. # TODO + self.curtail_up.value[:] = 0. 
# TODO + self._remove_margin_rounding() def compute_optimum_safe(self, obs: BaseObservation, l_id=None): @@ -655,27 +749,6 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): storage = cp.Variable(shape=self.nb_max_bus) # at each bus redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus - # stuff to put elsewhere (TODO) - past_dispatch = cp.Parameter(shape=self.nb_max_bus, - value=np.zeros(self.nb_max_bus) - ) # at each bus - for bus_id in range(self.nb_max_bus): - past_dispatch.value[bus_id] = obs.target_dispatch[self.bus_gen.value == bus_id].sum() - past_state_of_charge = cp.Parameter(shape=self.nb_max_bus, - value=np.zeros(self.nb_max_bus), - nonneg=True - ) # at each bus - for bus_id in range(self.nb_max_bus): - past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() - - # TODO put that in constructor with possibility to modify it ! - SoC = np.zeros(shape=self.nb_max_bus) - for bus_id in range(self.nb_max_bus): - SoC[bus_id] = 0.5 * obs.storage_Emax[self.bus_storage.value == bus_id].sum() - storage_target = cp.Parameter(shape=self.nb_max_bus, - value=1.0 * SoC, - nonneg=True) - # usefull quantities f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) @@ -684,8 +757,8 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): KCL_eq = self._aux_compute_kcl(inj_bus, f_or) theta_is_zero = self._mask_theta_zero() - dispatch_after_this = past_dispatch + redispatching - state_of_charge_after = past_state_of_charge + storage / (60. / obs.delta_time) + dispatch_after_this = self._past_dispatch + redispatching + state_of_charge_after = self._past_state_of_charge + storage / (60. 
/ obs.delta_time) # constraints constraints = ( # slack bus @@ -711,20 +784,19 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): # TODO (in ctor) redisp_target # TODO (in ctor) curtail_target - + # objective # cost = cp.norm1(gp_var) + cp.norm1(lp_var) - cost = ( self._penalty_curtailment * cp.sum_squares(curtailment_mw) + - self._penalty_storage * cp.sum_squares(storage) + - self._penalty_redispatching * cp.sum_squares(redispatching) + - cp.sum_squares(dispatch_after_this) + - cp.sum_squares(state_of_charge_after - storage_target) + cost = ( self._penalty_curtailment_safe * cp.sum_squares(curtailment_mw) + + self._penalty_storage_safe * cp.sum_squares(storage) + + self._penalty_redispatching_safe * cp.sum_squares(redispatching) + + self._weight_redisp_target * cp.sum_squares(dispatch_after_this) + + self._weight_storage_target * cp.sum_squares(state_of_charge_after - self._storage_target_bus) ) - + # solve prob = cp.Problem(cp.Minimize(cost), constraints) has_converged = self._solve_problem(prob) - if has_converged: self.flow_computed[:] = f_or.value res = (curtailment_mw.value, storage.value, redispatching.value) @@ -770,7 +842,7 @@ def act(self, self.update_parameters(obs, unsafe=False) curtailment, storage, redispatching = self.compute_optimum_safe(obs, l_id) # get back the grid2op representation - act = self.to_grid2op(obs, curtailment, storage, redispatching) + act = self.to_grid2op(obs, curtailment, storage, redispatching, act) else: # I do nothing between rho_danger and rho_safe act = self.action_space() diff --git a/l2rpn_baselines/test/test_optimcvxpy.py b/l2rpn_baselines/test/test_optimcvxpy.py index 6f79402..bce804f 100644 --- a/l2rpn_baselines/test/test_optimcvxpy.py +++ b/l2rpn_baselines/test/test_optimcvxpy.py @@ -6,14 +6,19 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +from pickle import FALSE import unittest import warnings +import numpy as np + import grid2op -from lightsim2grid import LightSimBackend from grid2op.Action import PlayableAction -from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY from grid2op.Parameters import Parameters +from lightsim2grid import LightSimBackend + +from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY + import pdb class TestOptimCVXPY(unittest.TestCase): @@ -148,7 +153,7 @@ def test_safe_dont_reco_cooldown(self): env, rho_safe=9.5, rho_danger=10., - margin_th_limit=0.9) + margin_th_limit=10.) l_id_disc = 4 # a cooldown applies, agent does not reconnect it @@ -180,6 +185,8 @@ def test_safe_dont_reco_cooldown(self): types = act.get_types() injection, voltage, topology, line, redispatching, storage, curtailment = types assert line + obs, reward, done, info = env.step(act) + assert obs.line_status[l_id_disc] def test_safe_setback_redisp(self): env = self._aux_create_env_setup() @@ -187,13 +194,63 @@ def test_safe_setback_redisp(self): env, rho_safe=9.5, rho_danger=10., - margin_th_limit=10.0) + margin_th_limit=10.0, + weight_storage_target=0. 
+ ) act_prev = env.action_space() act_prev.redispatch = [3.0, 4.0, 0.0, 0.0, 0.0, -7.0] obs, reward, done, info = env.step(act_prev) + disp_ref = 1.0 * obs.actual_dispatch assert not done act = agent.act(obs, None, None) - pdb.set_trace() - print(act) + obs, reward, done, info = env.step(act) + assert not done + # now check that it has set back the redispatching to a closer value to the reference + assert np.sum(obs.actual_dispatch**2) < np.sum(disp_ref**2) + + def test_safe_setback_storage(self): + param = Parameters() + param.NO_OVERFLOW_DISCONNECTION = True + param.ACTIVATE_STORAGE_LOSS = False # otherwise it makes tests more complicated + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, + env, + rho_safe=9.5, + rho_danger=10., + margin_th_limit=10.0, + weight_redisp_target=0. + ) + act_prev = env.action_space() + act_prev.storage_p = [4.9, -9.9] + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + for i in range(5): # more than 5 iterations and I got an error due to rounding + obs_before = obs.copy() + act = agent.act(obs_before, None, None) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert (np.sum((obs.storage_charge - 0.5 * obs.storage_Emax)**2) <= + np.sum((obs_before.storage_charge - 0.5 * obs.storage_Emax)**2)), f"error at iteration {i}" + + env = self._aux_create_env_setup() + act_prev = env.action_space() + act_prev.storage_p = [4.9, 9.9] + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + obs, reward, done, info = env.step(act_prev) + for i in range(5): # more than 5 iterations and I got an error due to rounding + obs_before = obs.copy() + agent._DEBUG = True + act = agent.act(obs_before, None, None) + obs, reward, done, info = env.step(act) + assert not info["exception"] + assert not done + assert (np.sum((obs.storage_charge - 0.5 * obs.storage_Emax)**2) <= + np.sum((obs_before.storage_charge - 0.5 * obs.storage_Emax)**2)), f"error at iteration {i}" + if __name__ == '__main__': unittest.main() \ No newline at end of file From 92e26f4f82ef7b23ef65ec29df92d8446e98aec0 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Thu, 5 May 2022 16:24:36 +0200 Subject: [PATCH 39/56] adding fine tuning parameters for the educ_case14 env --- examples/optim_cvxpy/Readme.md | 27 ++ .../optimcvxpy_educ_case14_storage.py | 63 +++ examples/optim_cvxpy/optimcvxpy_wcci_2022.py | 69 +++ examples/ppo_stable_baselines/ReadMe.md | 2 +- l2rpn_baselines/OptimCVXPY/evaluate.py | 3 +- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 416 ++++++++++++++---- l2rpn_baselines/test/test_optimcvxpy.py | 55 ++- 7 files changed, 548 insertions(+), 87 deletions(-) create mode 100644 examples/optim_cvxpy/Readme.md create mode 100644 examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py create mode 100644 examples/optim_cvxpy/optimcvxpy_wcci_2022.py diff --git a/examples/optim_cvxpy/Readme.md b/examples/optim_cvxpy/Readme.md new file mode 100644 index 0000000..69c6e26 --- /dev/null +++ b/examples/optim_cvxpy/Readme.md @@ -0,0 +1,27 @@ +# Objective + +This repo shows (and give some usefull parameters) how to use the optimization +`OptimCVXPY` method to tackle the problem in grid2op. 
+ +Parameters given here are not perfect but they allow to perform better than "do nothing" +on the particular scenario selected (selection has been made uniformly at random +between all scenario of the environment) + +## On the educ_case14_storage + +On this environment, the optimization procedure is pretty fast (~10-15 steps per second) and allow to +get through almost all the scenarios. + +It's probably possible to do better by fine tuning the other hyper parameters. + +You can have a look at the [**optimcvxpy_educ_case14_storage.py**](./optimcvxpy_educ_case14_storage.py) file for more information. + +## On the wcci 2022 environment + +For this environment, the model is pretty slow (sometimes 10-15s per step which is relatively important). +This leads to around 30 mins for completing a full scenario of a week (2016 steps) + +Because it took long time to compute, we only manage to find "good" parameters to do better than do nothing +for the selected scenarios. + +You can have a look at the [**optimcvxpy_wcci_2022.py**](./optimcvxpy_wcci_2022.py) file for more information. diff --git a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py new file mode 100644 index 0000000..e6fde08 --- /dev/null +++ b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+ +import os +import grid2op +from grid2op.Action import PlayableAction +from l2rpn_baselines.OptimCVXPY import OptimCVXPY +from lightsim2grid import LightSimBackend +from tqdm import tqdm +import pdb + +env = grid2op.make("educ_case14_storage", test=True, backend=LightSimBackend(), + action_class=PlayableAction) + +agent = OptimCVXPY(env.action_space, + env, + penalty_redispatching_unsafe=0., + penalty_storage_unsafe=0.1, + penalty_curtailment_unsafe=0.01, + rho_safe=0.85, + rho_danger=0.9, + margin_th_limit=0.93, + alpha_por_error=0.5, + weight_redisp_target=0., + ) + +# in safe / recovery mode agent tries to fill the storage units as much as possible +agent.storage_setpoint = env.storage_Emax + +print("For do nothing: ") +dn_act = env.action_space() +for scen_id in range(7): + env.set_id(scen_id) + obs = env.reset() + done = False + for nb_step in tqdm(range(287)): + obs, reward, done, info = env.step(dn_act) + if done: + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 287") + +print("For the optimizer: ") +for scen_id in range(7): + # if scen_id != 2: + # continue + + env.set_id(scen_id) + obs = env.reset() + agent.reset(obs) + done = False + for nb_step in tqdm(range(287)): + prev_obs = obs + act = agent.act(obs) + obs, reward, done, info = env.step(act) + if done: + print(prev_obs.storage_charge) + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 287") diff --git a/examples/optim_cvxpy/optimcvxpy_wcci_2022.py b/examples/optim_cvxpy/optimcvxpy_wcci_2022.py new file mode 100644 index 0000000..6e7e46b --- /dev/null +++ b/examples/optim_cvxpy/optimcvxpy_wcci_2022.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+ +import os +import grid2op +from l2rpn_baselines.OptimCVXPY import OptimCVXPY +from lightsim2grid import LightSimBackend +from tqdm import tqdm +import pdb + +env_name = "wcci_2022_dev" # name subject to change +is_test = False + +env = grid2op.make(env_name, + test=is_test, + backend=LightSimBackend() + ) + +agent = OptimCVXPY(env.action_space, + env, + penalty_redispatching_unsafe=0., + penalty_storage_unsafe=0.01, + penalty_curtailment_unsafe=0.01, + ) + +scen_test = ["2050-01-03_31", + "2050-02-21_31", + "2050-03-07_31", + "2050-04-18_31", + "2050-05-09_31", + "2050-06-27_31", + "2050-07-25_31", + "2050-08-01_31", + "2050-09-26_31", + "2050-10-03_31", + "2050-11-14_31", + "2050-12-19_31", + ] +scen_test = ["2050-02-21_31"] +print("For do nothing: ") +dn_act = env.action_space() +for scen_id in scen_test: + env.set_id(scen_id) + obs = env.reset() + done = False + for nb_step in tqdm(range(obs.max_step)): + obs, reward, done, info = env.step(dn_act) + if done: + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {obs.max_step}") + +print("For the optimizer: ") +for scen_id in scen_test: + env.set_id(scen_id) + obs = env.reset() + agent.reset(obs) + done = False + for nb_step in tqdm(range(obs.max_step)): + prev_obs = obs + act = agent.act(obs) + obs, reward, done, info = env.step(act) + if done: + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {obs.max_step}") diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md index 4185b0b..1cc926b 100644 --- a/examples/ppo_stable_baselines/ReadMe.md +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -1,6 +1,6 @@ # Objective -This repository demonstrates how to use grid2op, lightsim2grid and l2rpn-baselines to make a RL agent that is able to perform some actions on a grid2op environment using the PPO algorithm and the `stable-baselines3` rl library. +This directory shows how to use grid2op, lightsim2grid and l2rpn-baselines to make a RL agent that is able to perform some actions on a grid2op environment using the PPO algorithm and the `stable-baselines3` rl library. It focuses on the `PPO_SB3` baseline with a strong focus on **continuous** variables (curtailment and redispatching) diff --git a/l2rpn_baselines/OptimCVXPY/evaluate.py b/l2rpn_baselines/OptimCVXPY/evaluate.py index bf20d6b..def4161 100644 --- a/l2rpn_baselines/OptimCVXPY/evaluate.py +++ b/l2rpn_baselines/OptimCVXPY/evaluate.py @@ -6,6 +6,7 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + def evaluate(): # TODO ! - pass + raise NotImplementedError("Currently not implemented") diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index d6c6c06..905e142 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -6,17 +6,17 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+import sys from typing import Optional import logging import warnings -from attr import has import cvxpy as cp import numpy as np import grid2op from grid2op.Agent import BaseAgent from grid2op.Environment import Environment -from grid2op.Action import PlayableAction, ActionSpace, BaseAction +from grid2op.Action import ActionSpace, BaseAction from grid2op.Backend import PandaPowerBackend from grid2op.Observation import BaseObservation from lightsim2grid import LightSimBackend @@ -27,6 +27,10 @@ # TODO: "predictive control" # TODO: no flow in constraints but in objective function # TODO: reuse previous computations +# TODO: do not act on storage units / curtailment if not possible by the action space +# TODO have the agent "play" with the protection: if a powerline is not in danger, +# the margin_th_limit associated should be larger than if a powerline is close to be disconnected + class OptimCVXPY(BaseAgent): """ This agent choses its action by resolving, at each `agent.act(...)` call an optimization routine @@ -93,6 +97,10 @@ class OptimCVXPY(BaseAgent): _penalty_redispatching_safe: `cp.Parameter` _penalty_storage_safe: `cp.Parameter` + + _weight_redisp_target: `cp.Parameter` + + _weight_storage_target: `cp.Parameter` bus_or: `cp.Parameter` @@ -141,6 +149,7 @@ def __init__(self, env : Environment, lines_x_pu: Optional[np.array]=None, margin_th_limit: float=0.9, + alpha_por_error: float=0.5, rho_danger: float=0.95, rho_safe: float=0.85, penalty_curtailment_unsafe: float=0.1, @@ -152,8 +161,9 @@ def __init__(self, weight_storage_target: float=1.0, penalty_storage_safe: float=0.0, margin_rounding: float=0.01, - margin_sparse: float=1e-4, - logger : Optional[logging.Logger]=None) -> None: + margin_sparse: float=5e-3, + logger : Optional[logging.Logger]=None + ) -> None: """Initialize this class Parameters @@ -180,6 +190,9 @@ def __init__(self, `margin_th_limit * thermal_limit_mw`. The model is particularly sensitive to this parameter. + + alpha_por_error: `float` + TODO rho_danger: `float` If any `obs.rho` is above `rho_danger`, then the agent will use the @@ -234,6 +247,9 @@ def __init__(self, to grid2op actions: if some values are below this value, then they are set to zero. + Defaults to 5e-3 (grid2op precision is 0.01 MW, anything below this will have no + real impact anyway) + logger: `logging.Logger` A logger to log information about the optimization process. @@ -247,6 +263,24 @@ def __init__(self, be inferred from the environment. """ + if env.n_storage > 0 and not env.action_space.supports_type("set_storage"): + # TODO + raise RuntimeError("Impossible to create this class with an environment that does not allow " + "modification of storage units when there are storage units on the grid. " + "Allowing it would require only little changes, if you want it let us know " + "with a github issue at https://github.com/rte-france/l2rpn-baselines/issues/new.") + + if np.any(env.gen_renewable) and not env.action_space.supports_type("curtail"): + # TODO + raise RuntimeError("Impossible to create this class with an environment that does not allow " + "curtailment when there are renewable generators on the grid. " + "Allowing it would require only little changes, if you want it let us know " + "with a github issue at https://github.com/rte-france/l2rpn-baselines/issues/new.") + + if not env.action_space.supports_type("redispatch"): + raise RuntimeError("This type of agent can only perform actions using storage units, curtailment or" + "redispatching. 
It requires at least to be able to do redispatching.") + BaseAgent.__init__(self, action_space) self._margin_th_limit: cp.Parameter = cp.Parameter(value=margin_th_limit, nonneg=True) @@ -268,8 +302,11 @@ def __init__(self, nonneg=True) self._weight_storage_target: cp.Parameter = cp.Parameter(value=weight_storage_target, nonneg=True) - - + # takes into account the previous errors on the flows (in an additive fashion) + # new flows are 1/x(theta_or - theta_ex) * alpha_por_error . (prev_flows - obs.p_or) + self._alpha_por_error: cp.Parameter = cp.Parameter(value=alpha_por_error, + nonneg=True, + ) self.nb_max_bus: int = 2 * env.n_sub SoC = np.zeros(shape=self.nb_max_bus) @@ -315,6 +352,9 @@ def __init__(self, self._powerlines_x: cp.Parameter = cp.Parameter(shape=powerlines_x.shape, value=1.0 * powerlines_x, pos=True) + self._prev_por_error: cp.Parameter = cp.Parameter(shape=powerlines_x.shape, + value=np.zeros(env.n_line) + ) # TODO replace all below with sparse matrices # to be able to change the topology more easily @@ -330,9 +370,12 @@ def __init__(self, self.bus_gen: cp.Parameter = cp.Parameter(shape=env.n_gen, value=env.gen_to_subid, integer=True) - self.bus_storage: cp.Parameter = cp.Parameter(shape=env.n_storage, - value=env.storage_to_subid, - integer=True) + if env.n_storage: + self.bus_storage: cp.Parameter = cp.Parameter(shape=env.n_storage, + value=env.storage_to_subid, + integer=True) + else: + self.bus_storage = None this_zeros_ = np.zeros(self.nb_max_bus) self.load_per_bus: cp.Parameter = cp.Parameter(shape=self.nb_max_bus, @@ -374,12 +417,15 @@ def __init__(self, value=np.zeros(self.nb_max_bus), nonneg=True ) - + self._v_ref: np.ndarray = 1.0 * env.get_obs().v_or if logger is None: self.logger: logging.Logger = logging.getLogger(__name__) - self.logger.disabled = False + self.logger.disabled = True + # self.logger.disabled = False + # self.logger.addHandler(logging.StreamHandler(sys.stdout)) + # self.logger.setLevel(level=logging.DEBUG) else: self.logger: logging.Logger = logger.getChild("OptimCVXPY") @@ -419,14 +465,31 @@ def penalty_storage(self, val: float): self._penalty_storage_unsafe = float(val) @property - def storage_setpoint(self) -> cp.Parameter: + def storage_setpoint(self) -> np.ndarray: return self._storage_setpoint @storage_setpoint.setter def storage_setpoint(self, val: np.ndarray): - self._storage_setpoint.value[:] = np.array(val).astype(float) - + self._storage_setpoint[:] = np.array(val).astype(float) + def reset(self, obs: BaseObservation): + """ + This method is called at the beginning of a new episode. + It is implemented by agents to reset their internal state if needed. + + Attributes + ----------- + obs: :class:`grid2op.Observation.BaseObservation` + The first observation corresponding to the initial state of the environment. + """ + self._prev_por_error.value[:] = 0. 
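        # descriptive note on the mechanism: run_dc() below fills self.flow_computed with
        # DC flows, and the gap with the AC flows reported by the observation (obs.p_or)
        # is stored in _prev_por_error.  The optimization routines then use
        #     f_or_corr = f_or - alpha_por_error * prev_por_error
        # so that, for instance, if the DC model over-estimated a flow by 5 MW on the
        # previous step and alpha_por_error is 0.5, the flow used in the constraints is
        # lowered by 2.5 MW, nudging the DC approximation toward the observed AC behaviour.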
+ conv_ = self.run_dc(obs) + if conv_: + self._prev_por_error.value[:] = self.flow_computed - obs.p_or + else: + self.logger.warning("Impossible to intialize the OptimCVXPY " + "agent because the DC powerflow did not converge.") + def _update_topo_param(self, obs: BaseObservation): tmp_ = 1 * obs.line_or_to_subid tmp_ [obs.line_or_bus == 2] += obs.n_sub @@ -448,9 +511,10 @@ def _update_topo_param(self, obs: BaseObservation): tmp_[obs.gen_bus == 2] += obs.n_sub self.bus_gen.value[:] = tmp_ - tmp_ = obs.storage_to_subid - tmp_[obs.storage_bus == 2] += obs.n_sub - self.bus_storage.value[:] = tmp_ + if self.bus_storage is not None: + tmp_ = obs.storage_to_subid + tmp_[obs.storage_bus == 2] += obs.n_sub + self.bus_storage.value[:] = tmp_ def _update_th_lim_param(self, obs: BaseObservation): # take into account reactive value (and current voltage) in thermal limit @@ -470,7 +534,8 @@ def _update_inj_param(self, obs: BaseObservation): load_p *= (obs.gen_p.sum() - obs.storage_power.sum()) / load_p.sum() for bus_id in range(self.nb_max_bus): self.load_per_bus.value[bus_id] += load_p[self.bus_load.value == bus_id].sum() - self.load_per_bus.value[bus_id] += obs.storage_power[self.bus_storage.value == bus_id].sum() + if self.bus_storage is not None: + self.load_per_bus.value[bus_id] += obs.storage_power[self.bus_storage.value == bus_id].sum() self.gen_per_bus.value[bus_id] += obs.gen_p[self.bus_gen.value == bus_id].sum() def _add_redisp_const(self, obs: BaseObservation, bus_id: int): @@ -479,20 +544,23 @@ def _add_redisp_const(self, obs: BaseObservation, bus_id: int): self.redisp_down.value[bus_id] = obs.gen_margin_down[self.bus_gen.value == bus_id].sum() def _add_storage_const(self, obs: BaseObservation, bus_id: int): + if self.bus_storage is None: + return + # limit in MW stor_down = obs.storage_max_p_prod[self.bus_storage.value == bus_id].sum() # limit due to energy (if almost empty) - stor_down = np.minimum(stor_down, - obs.storage_charge[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) - ) + stor_down = min(stor_down, + obs.storage_charge[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) + ) self.storage_down.value[bus_id] = stor_down # limit in MW stor_up = obs.storage_max_p_absorb[self.bus_storage.value == bus_id].sum() # limit due to energy (if almost full) - stor_up = np.minimum(stor_up, - (obs.storage_Emax - obs.storage_charge)[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) - ) + stor_up = min(stor_up, + (obs.storage_Emax - obs.storage_charge)[self.bus_storage.value == bus_id].sum() * (60. / obs.delta_time) + ) self.storage_up.value[bus_id] = stor_up def _update_constraints_param_unsafe(self, obs: BaseObservation): @@ -504,8 +572,9 @@ def _update_constraints_param_unsafe(self, obs: BaseObservation): self._add_redisp_const(obs, bus_id) # curtailment - self.curtail_down.value[bus_id] = 0. - self.curtail_up.value[bus_id] = tmp_[(self.bus_gen.value == bus_id) & obs.gen_renewable].sum() + mask_ = (self.bus_gen.value == bus_id) & obs.gen_renewable + self.curtail_down.value[bus_id] = 0. # TODO obs.gen_p_before_curtail[mask_].sum() - tmp_[mask_].sum() ? 
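            # what these two bounds mean in "unsafe" mode: curtail_down = 0. forbids the
            # optimizer from giving back energy that is already curtailed, while
            # curtail_up (set on the next line) lets it remove at most what the renewable
            # generators of this bus are currently producing.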
+ self.curtail_up.value[bus_id] = tmp_[mask_].sum() # storage self._add_storage_const(obs, bus_id) @@ -540,7 +609,10 @@ def update_parameters(self, obs: BaseObservation, unsafe: bool = True): self._update_th_lim_param(obs) ## update the load / gen bus injected values - self._update_inj_param(obs) + self._update_inj_param(obs) + # TODO have some kind of "state estimator" + # TODO to get the "best" p's at each nodes to match AC flows in observation + # TODO with a DC model ## update the constraints parameters if unsafe: @@ -569,9 +641,79 @@ def _mask_theta_zero(self): theta_is_zero[self.bus_ex.value] = False theta_is_zero[self.bus_load.value] = False theta_is_zero[self.bus_gen.value] = False - theta_is_zero[self.bus_storage.value] = False + if self.bus_storage is not None: + theta_is_zero[self.bus_storage.value] = False theta_is_zero[0] = True # slack bus return theta_is_zero + + def run_dc(self, obs: BaseObservation): + """This method allows to perform a dc approximation from + the state given by the observation. + + To make sure that `sum P = sum C` in this system, the **loads** + are scaled up. + + This function can primarily be used to retrieve the active power + in each branch of the grid. + + Parameters + ---------- + obs : BaseObservation + The observation (used to get the topology and the injections) + + Examples + --------- + You can use it with: + + .. code-block:: python + + import grid2op + from l2rpn_baselines.OptimCVXPY import OptimCVXPY + + env_name = "l2rpn_case14_sandbox" + env = grid2op.make(env_name) + + agent = OptimCVXPY(env.action_space, env) + + obs = env.reset() + conv = agent.run_dc(obs) + if conv: + print(f"flows are: {agent.flow_computed}") + else: + print("DC powerflow has diverged") + + """ + # update the parameters for the injection and topology + self._update_topo_param(obs) + self._update_inj_param(obs) + + # define the variables + theta = cp.Variable(shape=self.nb_max_bus) + + # temporary variables + f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) + inj_bus = self.load_per_bus - self.gen_per_bus + KCL_eq = self._aux_compute_kcl(inj_bus, f_or) + theta_is_zero = self._mask_theta_zero() + # constraints + constraints = ([theta[theta_is_zero] == 0] + + [el == 0 for el in KCL_eq]) + # no real cost here + cost = 1. + + # solve + prob = cp.Problem(cp.Minimize(cost), constraints) + has_converged = self._solve_problem(prob) + + # format the results + if has_converged: + self.flow_computed[:] = f_or.value + else: + self.logger.error(f"Problem with dc approximation for all solver ({type(self).SOLVER_TYPES}). " + "Is your grid connected (one single connex component) ?") + self.flow_computed[:] = np.NaN + + return has_converged def compute_optimum_unsafe(self): # variables @@ -582,6 +724,7 @@ def compute_optimum_unsafe(self): # usefull quantities f_or = cp.multiply(1. 
/ self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) + f_or_corr = f_or - self._alpha_por_error * self._prev_por_error inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) @@ -611,7 +754,7 @@ def compute_optimum_unsafe(self): cost = ( self._penalty_curtailment_unsafe * cp.sum_squares(curtailment_mw) + self._penalty_storage_unsafe * cp.sum_squares(storage) + self._penalty_redispatching_unsafe * cp.sum_squares(redispatching) + - cp.sum_squares(cp.pos(cp.abs(f_or) - self._margin_th_limit * self._th_lim_mw)) + cp.sum_squares(cp.pos(cp.abs(f_or_corr) - self._margin_th_limit * self._th_lim_mw)) ) # solve @@ -659,56 +802,132 @@ def _solve_problem(self, prob, solver_type=None): def _clean_vect(self, curtailment, storage, redispatching): """remove the value too small and set them at 0.""" - curtailment[np.abs(curtailment) <= self.margin_sparse] = 0. - storage[np.abs(storage) <= self.margin_sparse] = 0. - redispatching[np.abs(redispatching) <= self.margin_sparse] = 0. + curtailment[np.abs(curtailment) < self.margin_sparse] = 0. + storage[np.abs(storage) < self.margin_sparse] = 0. + redispatching[np.abs(redispatching) < self.margin_sparse] = 0. def to_grid2op(self, - obs, + obs: BaseObservation, curtailment: np.ndarray, storage: np.ndarray, redispatching: np.ndarray, - act=None) -> BaseAction: + act: BaseAction =None) -> BaseAction: + """Convert the action (given as vectors of real number output of the optimizer) + to a valid grid2op action. + + Parameters + ---------- + obs : BaseObservation + The current observation, used to get some information about the grid + + curtailment : np.ndarray + Representation of the curtailment + + storage : np.ndarray + Action on storage units + + redispatching : np.ndarray + Action on redispatching + + act : BaseAction, optional + The previous action to modify (if any), by default None + + Returns + ------- + BaseAction + The action taken represented as a grid2op action + """ self._clean_vect(curtailment, storage, redispatching) if act is None: act = self.action_space() # storage - storage_ = np.zeros(shape=act.n_storage) - storage_[:] = storage[self.bus_storage.value] - # TODO what is multiple storage on a single bus ? - act.storage_p = storage_ + if act.n_storage and np.any(np.abs(storage) > 0.): + storage_ = np.zeros(shape=act.n_storage) + storage_[:] = storage[self.bus_storage.value] + # TODO what is multiple storage on a single bus ? + act.storage_p = storage_ # curtailment # becarefull here, the curtailment is given by the optimizer # in the amount of MW you remove, grid2op # expects a maximum value - curtailment_ = np.zeros(shape=act.n_gen) -1. - gen_curt = obs.gen_renewable & (obs.gen_p > 0.1) - idx_gen = self.bus_gen.value[gen_curt] - tmp_ = curtailment[idx_gen] - modif_gen_optim = tmp_ != 0. - gen_p = 1.0 * obs.gen_p - aux_ = curtailment_[gen_curt] - aux_[modif_gen_optim] = (gen_p[gen_curt][modif_gen_optim] - - tmp_[modif_gen_optim] * - gen_p[gen_curt][modif_gen_optim] / - self.gen_per_bus.value[idx_gen][modif_gen_optim] - ) - aux_[~modif_gen_optim] = -1. - curtailment_[gen_curt] = aux_ - curtailment_[~gen_curt] = -1. - act.curtail_mw = curtailment_ + if np.any(np.abs(curtailment) > 0.): + curtailment_ = np.zeros(shape=act.n_gen) -1. + gen_curt = obs.gen_renewable & (obs.gen_p > 0.1) + idx_gen = self.bus_gen.value[gen_curt] + tmp_ = curtailment[idx_gen] + modif_gen_optim = tmp_ != 0. 
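            # how the conversion works: the per-bus curtailment, expressed in MW to
            # remove, is shared between the renewable generators of the bus
            # proportionally to their weight in the total production of that bus
            # (self.gen_per_bus).  For instance, with 50 MW produced at the bus, a 10 MW
            # curtailment and a renewable unit currently producing 30 MW, the limit sent
            # to grid2op for that unit is 30 - 10 * 30 / 50 = 24 MW.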
+ gen_p = 1.0 * obs.gen_p + aux_ = curtailment_[gen_curt] + aux_[modif_gen_optim] = (gen_p[gen_curt][modif_gen_optim] - + tmp_[modif_gen_optim] * + gen_p[gen_curt][modif_gen_optim] / + self.gen_per_bus.value[idx_gen][modif_gen_optim] + ) + aux_[~modif_gen_optim] = -1. + curtailment_[gen_curt] = aux_ + curtailment_[~gen_curt] = -1. + act.curtail_mw = curtailment_ # redispatching - redisp_ = np.zeros(obs.n_gen) - gen_redi = obs.gen_redispatchable & (obs.gen_p > 0.1) - idx_gen = self.bus_gen.value[gen_redi] - tmp_ = redispatching[idx_gen] - redisp_[gen_redi] = tmp_ * gen_p[gen_redi] / self.gen_per_bus.value[idx_gen] - redisp_[~gen_redi] = 0. - act.redispatch = redisp_ + if np.any(np.abs(redispatching) > 0.): + redisp_ = np.zeros(obs.n_gen) + gen_redi = obs.gen_redispatchable # & (obs.gen_p > self.margin_sparse) + idx_gen = self.bus_gen.value[gen_redi] + tmp_ = redispatching[idx_gen] + gen_p = 1.0 * obs.gen_p + # TODO below, 1 issue: + # it will necessarily turn on generators if one is connected to a bus and another not + redisp_avail = np.zeros(self.nb_max_bus) + for bus_id in range(self.nb_max_bus): + if redispatching[bus_id] > 0.: + redisp_avail[bus_id] = obs.gen_margin_up[self.bus_gen.value == bus_id].sum() + elif redispatching[bus_id] < 0.: + redisp_avail[bus_id] = obs.gen_margin_down[self.bus_gen.value == bus_id].sum() + # NB: I cannot reuse self.redisp_up above because i took some "margin" in the optimization + # this leads obs.gen_max_ramp_up / self.redisp_up to be > 1.0 and... + # violates the constraints of the environment... + + # below I compute the numerator: by what the total redispatching at each + # node should be split between the different generators connected to it + prop_to_gen = np.zeros(obs.n_gen) + redisp_up = np.zeros(obs.n_gen, dtype=bool) + redisp_up[gen_redi] = tmp_ > 0. + prop_to_gen[redisp_up] = obs.gen_max_ramp_up[redisp_up] + redisp_down = np.zeros(obs.n_gen, dtype=bool) + redisp_down[gen_redi] = tmp_ < 0. + prop_to_gen[redisp_down] = obs.gen_max_ramp_down[redisp_down] + + # avoid numeric issues + nothing_happens = (redisp_avail[idx_gen] == 0.) & (prop_to_gen[gen_redi] == 0.) + set_to_one_nothing = 1.0 * redisp_avail[idx_gen] + set_to_one_nothing[nothing_happens] = 1.0 + redisp_avail[idx_gen] = set_to_one_nothing # avoid 0. / 0. and python sends a warning + + if np.any(np.abs(redisp_avail[idx_gen]) <= self.margin_sparse): + self.logger.warning("Some generator have a dispatch assign to them by " + "the optimizer, but they don't have any margin. " + "The dispatch has been canceled (this was probably caused " + "by the optimizer not meeting certain constraints).") + this_fix_ = 1.0 * redisp_avail[idx_gen] + too_small_here = np.abs(this_fix_) <= self.margin_sparse + tmp_[too_small_here] = 0. + this_fix_[too_small_here] = 1. + redisp_avail[idx_gen] = this_fix_ + + # Now I split the output of the optimization between the generators + try: + with warnings.catch_warnings(): + warnings.filterwarnings("error") + redisp_[gen_redi] = tmp_ * prop_to_gen[gen_redi] / redisp_avail[idx_gen] + except Exception as exc_: + print("a warning occured") + pdb.set_trace() + print("toto") + redisp_[~gen_redi] = 0. + act.redispatch = redisp_ return act def _update_constraints_param_safe(self, obs): @@ -721,19 +940,23 @@ def _update_constraints_param_safe(self, obs): # storage self._add_storage_const(obs, bus_id) - # curtailment - # self.curtail_down.value[bus_id] = 0. 
- # self.curtail_up.value[bus_id] = tmp_[(self.bus_gen.value == bus_id) & obs.gen_renewable].sum() + # curtailment #TODO + # mask_ = (self.bus_gen.value == bus_id) & obs.gen_renewable + # self.curtail_down.value[bus_id] = obs.gen_pmax[mask_].sum() - tmp_[mask_].sum() + # self.curtail_up.value[bus_id] = tmp_[mask_].sum() # storage target - self._storage_target_bus.value[bus_id] = self._storage_setpoint[self.bus_storage.value == bus_id].sum() + if self.bus_storage is not None: + self._storage_target_bus.value[bus_id] = self._storage_setpoint[self.bus_storage.value == bus_id].sum() # past information - self._past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() + if self.bus_storage is not None: + self._past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() self._past_dispatch.value[bus_id] = obs.target_dispatch[self.bus_gen.value == bus_id].sum() - - self.curtail_down.value[:] = 0. # TODO - self.curtail_up.value[:] = 0. # TODO + + #TODO + self.curtail_down.value[:] = 0. + self.curtail_up.value[:] = 0. self._remove_margin_rounding() @@ -747,10 +970,11 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): theta = cp.Variable(shape=self.nb_max_bus) # at each bus curtailment_mw = cp.Variable(shape=self.nb_max_bus) # at each bus storage = cp.Variable(shape=self.nb_max_bus) # at each bus - redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus - + redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus + # usefull quantities f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) + f_or_corr = f_or - self._alpha_por_error * self._prev_por_error inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) @@ -768,8 +992,8 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): [el == 0 for el in KCL_eq] + # I impose here that the flows are bellow the limits - [f_or <= self._margin_th_limit * self._th_lim_mw] + - [f_or >= -self._margin_th_limit * self._th_lim_mw] + + [f_or_corr <= self._margin_th_limit * self._th_lim_mw] + + [f_or_corr >= -self._margin_th_limit * self._th_lim_mw] + # limit redispatching to possible values [redispatching <= self.redisp_up, redispatching >= -self.redisp_down] + @@ -810,13 +1034,47 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): def act(self, obs: BaseObservation, - reward: float, - done: bool) -> BaseAction: + reward: float=1., + done: bool=False) -> BaseAction: + """This function is the main method of this class. + + It is through this function that the agent will take some actions (remember actions + concerns only redispatching, curtailment and action on storage units - and powerline reconnection + on some cases) + + It has basically 3 modes: + + - if the grid is in danger (`obs.rho.max() > self.rho_danger`) it will try to get the grid back to safety + (if possible) + - if the grid is safe (`obs.rho.max() < self.rho_safe`) it will try to get the grid back to a "reference" + state (redispatching at 0., storage units close to `self.storage_setpoint`) + - otherwise do nothing (this is mainly to avoid oscillating between the two previous state) + + Parameters + ---------- + obs : BaseObservation + The current observation + reward : float, optional + unused, for compatibility with gym / grid2op agent interface, by default 1. 
+ done : bool, optional + unused, for compatibility with gym / grid2op agent interface, by default False + + Returns + ------- + BaseAction + The action the agent would do + + """ + prev_ok = np.isfinite(self.flow_computed) + self._prev_por_error.value[prev_ok] = self.flow_computed[prev_ok] - obs.p_or[prev_ok] + self._prev_por_error.value[~prev_ok] = 0. + # print(f"{np.abs(self._prev_por_error.value).mean()}") + # print(f"{np.abs(self._prev_por_error.value).max()}") self.flow_computed[:] = np.NaN if obs.rho.max() > self.rho_danger: # I attempt to make the grid more secure - + self.logger.info(f"step {obs.current_step}, danger mode") # update the observation self.update_parameters(obs) # solve the problem @@ -827,6 +1085,8 @@ def act(self, # I attempt to get back to a more robust state (reconnect powerlines, # storage state of charge close to the target state of charge, # redispatching close to 0.0 etc.) + self.logger.info(f"step {obs.current_step}, safe / recovery mode") + act = self.action_space() can_be_reco = (obs.time_before_cooldown_line == 0) & (~obs.line_status) @@ -845,7 +1105,11 @@ def act(self, act = self.to_grid2op(obs, curtailment, storage, redispatching, act) else: # I do nothing between rho_danger and rho_safe + self.logger.info(f"step {obs.current_step}, do nothing mode") act = self.action_space() + + self.flow_computed[:] = obs.p_or + return act if __name__ == "__main__": diff --git a/l2rpn_baselines/test/test_optimcvxpy.py b/l2rpn_baselines/test/test_optimcvxpy.py index bce804f..317a997 100644 --- a/l2rpn_baselines/test/test_optimcvxpy.py +++ b/l2rpn_baselines/test/test_optimcvxpy.py @@ -59,7 +59,8 @@ def _aux_create_env_setup(self, param=None): def test_unsafe(self): env = self._aux_create_env_setup() - agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85) + agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85, + alpha_por_error=0.) obs, reward, done, info = env.step(env.action_space()) # max rhos of the 3 following step if I do nothing @@ -91,7 +92,7 @@ def test_unsafe(self): def test_unsafe_linedisc(self): env = self._aux_create_env_setup() - agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85) + agent = OptimCVXPY(env.action_space, env, rho_danger=0., margin_th_limit=0.85, alpha_por_error=0.) l_id_disc = 4 obs, reward, done, info = env.step(env.action_space({"set_line_status": [(l_id_disc, -1)]})) @@ -134,7 +135,8 @@ def test_safe_do_reco(self): env, rho_safe=9.5, rho_danger=10., - margin_th_limit=0.9) + margin_th_limit=0.9, + alpha_por_error=0.) l_id_disc = 4 obs, reward, done, info = env.step(env.action_space({"set_line_status": [(l_id_disc, -1)]})) @@ -153,7 +155,8 @@ def test_safe_dont_reco_cooldown(self): env, rho_safe=9.5, rho_danger=10., - margin_th_limit=10.) + margin_th_limit=10., + alpha_por_error=0.) l_id_disc = 4 # a cooldown applies, agent does not reconnect it @@ -195,7 +198,8 @@ def test_safe_setback_redisp(self): rho_safe=9.5, rho_danger=10., margin_th_limit=10.0, - weight_storage_target=0. + weight_storage_target=0., + alpha_por_error=0. ) act_prev = env.action_space() act_prev.redispatch = [3.0, 4.0, 0.0, 0.0, 0.0, -7.0] @@ -218,7 +222,8 @@ def test_safe_setback_storage(self): rho_safe=9.5, rho_danger=10., margin_th_limit=10.0, - weight_redisp_target=0. + weight_redisp_target=0., + alpha_por_error=0. 
) act_prev = env.action_space() act_prev.storage_p = [4.9, -9.9] @@ -247,10 +252,42 @@ def test_safe_setback_storage(self): agent._DEBUG = True act = agent.act(obs_before, None, None) obs, reward, done, info = env.step(act) - assert not info["exception"] - assert not done + assert not info["exception"], f"error at iteration {i}" + assert not done, f"error at iteration {i}" assert (np.sum((obs.storage_charge - 0.5 * obs.storage_Emax)**2) <= np.sum((obs_before.storage_charge - 0.5 * obs.storage_Emax)**2)), f"error at iteration {i}" - + def test_run_dc(self): + env = self._aux_create_env_setup() + agent = OptimCVXPY(env.action_space, + env, + alpha_por_error=0., + ) + obs = env.get_obs() + agent.flow_computed[:] = np.NaN + agent.run_dc(obs) + assert np.all(np.isfinite(agent.flow_computed)) + init_flow = 1.0 * agent.flow_computed + + obs, reward, done, info = env.step(env.action_space()) + agent.flow_computed[:] = np.NaN + agent.run_dc(obs) + assert np.all(np.isfinite(agent.flow_computed)) + after_flow = 1.0 * agent.flow_computed + assert np.all(init_flow != after_flow) + + def test_without_storage(self): + env_name = "l2rpn_case14_sandbox" + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make(env_name, test=True) + obs = env.reset() + agent = OptimCVXPY(env.action_space, env, alpha_por_error=0.) + conv = agent.run_dc(obs) + assert conv + act = agent.act(obs, 1.0, False) + obs, reward, done, info = env.step(act) + act = agent.act(obs, 1.0, False) + + if __name__ == '__main__': unittest.main() \ No newline at end of file From a0a32e367b6e904c534eec3cf5fc851872dbb4e2 Mon Sep 17 00:00:00 2001 From: Donnot Benjamin Date: Mon, 9 May 2022 07:22:02 +0200 Subject: [PATCH 40/56] improvments in the optimcvxpy agent --- Inspect_scenario_timestep.py | 2 +- .../optimcvxpy_educ_case14_storage.py | 94 ++++++++++--------- .../ppo_stable_baselines/B_train_agent.py | 6 +- l2rpn_baselines/OptimCVXPY/__init__.py | 4 +- l2rpn_baselines/OptimCVXPY/make_agent.py | 42 +++++++++ l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 40 ++++---- l2rpn_baselines/utils/gymenv_custom.py | 1 + 7 files changed, 126 insertions(+), 63 deletions(-) create mode 100644 l2rpn_baselines/OptimCVXPY/make_agent.py diff --git a/Inspect_scenario_timestep.py b/Inspect_scenario_timestep.py index f750f68..89bc37d 100644 --- a/Inspect_scenario_timestep.py +++ b/Inspect_scenario_timestep.py @@ -34,4 +34,4 @@ #plot observation plot_helper = PlotMatplot(env.observation_space) fig_obs = plot_helper.plot_obs(new_obs) -fig_obs.show() \ No newline at end of file +fig_obs.show() diff --git a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py index e6fde08..a95c0ce 100644 --- a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py +++ b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py @@ -14,50 +14,58 @@ from tqdm import tqdm import pdb -env = grid2op.make("educ_case14_storage", test=True, backend=LightSimBackend(), - action_class=PlayableAction) +if __name__ == "__main__": + env = grid2op.make("educ_case14_storage", + test=True, + backend=LightSimBackend(), + action_class=PlayableAction) -agent = OptimCVXPY(env.action_space, - env, - penalty_redispatching_unsafe=0., - penalty_storage_unsafe=0.1, - penalty_curtailment_unsafe=0.01, - rho_safe=0.85, - rho_danger=0.9, - margin_th_limit=0.93, - alpha_por_error=0.5, - weight_redisp_target=0., - ) + agent = OptimCVXPY(env.action_space, + env, + penalty_redispatching_unsafe=0., + penalty_storage_unsafe=0.1, + 
penalty_curtailment_unsafe=0.01, + rho_safe=0.95, + rho_danger=0.97, + margin_th_limit=0.95, + alpha_por_error=0.5, + weight_redisp_target=0.3, + ) -# in safe / recovery mode agent tries to fill the storage units as much as possible -agent.storage_setpoint = env.storage_Emax + # in safe / recovery mode agent tries to fill the storage units as much as possible + agent.storage_setpoint = env.storage_Emax -print("For do nothing: ") -dn_act = env.action_space() -for scen_id in range(7): - env.set_id(scen_id) - obs = env.reset() - done = False - for nb_step in tqdm(range(287)): - obs, reward, done, info = env.step(dn_act) - if done: - break - print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 287") + # print("For do nothing: ") + # dn_act = env.action_space() + # for scen_id in range(7): + # env.set_id(scen_id) + # obs = env.reset() + # done = False + # for nb_step in tqdm(range(288)): + # obs, reward, done, info = env.step(dn_act) + # if done: + # break + # print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 288") -print("For the optimizer: ") -for scen_id in range(7): - # if scen_id != 2: - # continue - - env.set_id(scen_id) - obs = env.reset() - agent.reset(obs) - done = False - for nb_step in tqdm(range(287)): - prev_obs = obs - act = agent.act(obs) - obs, reward, done, info = env.step(act) - if done: - print(prev_obs.storage_charge) - break - print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 287") + print("For the optimizer: ") + for scen_id in range(7): + # if scen_id != 2: + # continue + + env.set_id(scen_id) + obs = env.reset() + agent.reset(obs) + done = False + for nb_step in tqdm(range(288)): + prev_obs = obs + # agent._DEBUG = nb_step >= 22 + # agent._DEBUG = nb_step >= 10 + # agent._DEBUG = nb_step >= 190 + act = agent.act(obs) + obs, reward, done, info = env.step(act) + if done: + # print(info) + # print(prev_obs.storage_charge) + # print(prev_obs.target_dispatch) + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 288") diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 3c82921..98a3984 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -16,11 +16,13 @@ import re import numpy as np from grid2op.Reward import BaseReward +from grid2op.Action import PlayableAction from l2rpn_baselines.utils import GymEnvWithReco, GymEnvWithRecoWithDN env_name = "l2rpn_icaps_2021_small_train" env_name = "l2rpn_wcci_2022_dev_train" env_name = "wcci_2022_dev_2" +env_name = "l2rpn_case14_sandbox" save_path = "./saved_model" name = "expe_GymEnvWithRecoWithDN_2022_test5" gymenv_class = GymEnvWithRecoWithDN # uses the heuristic to do nothing is the grid is not at risk and to reconnect powerline automatically @@ -133,6 +135,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): gamma = 0.999 env = grid2op.make(env_name, + action_class=PlayableAction, reward_class=CustomReward, backend=LightSimBackend(), chronics_class=MultifolderWithCache) @@ -147,7 +150,8 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: True) - 
env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-08-01_.*$", x) is not None) + env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*500$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-08-01_.*$", x) is not None) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! diff --git a/l2rpn_baselines/OptimCVXPY/__init__.py b/l2rpn_baselines/OptimCVXPY/__init__.py index 34e8275..e8ea384 100644 --- a/l2rpn_baselines/OptimCVXPY/__init__.py +++ b/l2rpn_baselines/OptimCVXPY/__init__.py @@ -8,8 +8,10 @@ __all__ = [ "evaluate", - "OptimCVXPY" + "OptimCVXPY", + "make_agent", ] from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY from l2rpn_baselines.OptimCVXPY.evaluate import evaluate +from l2rpn_baselines.OptimCVXPY.make_agent import make_agent diff --git a/l2rpn_baselines/OptimCVXPY/make_agent.py b/l2rpn_baselines/OptimCVXPY/make_agent.py new file mode 100644 index 0000000..5748526 --- /dev/null +++ b/l2rpn_baselines/OptimCVXPY/make_agent.py @@ -0,0 +1,42 @@ +# Copyright (c) 2020-2022 RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +import os +from grid2op.Environment import Environment +from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY + + +def make_agent(env: Environment, dir_path: os.PathLike) -> OptimCVXPY: + """First example of the function you will need to provide + to send your agent to l2rpn competitions or + to use your agent in grid2game. + + Parameters + ---------- + env : Environment + _description_ + dir_path : os.PathLike + _description_ + + Returns + ------- + OptimCVXPY + _description_ + """ + # TODO read the parameters from a config file ! + agent = OptimCVXPY(env.action_space, + env, + penalty_redispatching_unsafe=0., + penalty_storage_unsafe=0.1, + penalty_curtailment_unsafe=0.01, + rho_safe=0.85, + rho_danger=0.9, + margin_th_limit=0.93, + alpha_por_error=0.5, + weight_redisp_target=0.,) + + return agent diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index 905e142..a23eaf8 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -31,6 +31,9 @@ # TODO have the agent "play" with the protection: if a powerline is not in danger, # the margin_th_limit associated should be larger than if a powerline is close to be disconnected +# TODO add margin in the generator not to put them at pmin / pmax ! +# TODO "remove" past actions for the storage, curtailment: +# redispatching continues in time, storage is a "one time thing" class OptimCVXPY(BaseAgent): """ This agent choses its action by resolving, at each `agent.act(...)` call an optimization routine @@ -308,6 +311,7 @@ def __init__(self, nonneg=True, ) self.nb_max_bus: int = 2 * env.n_sub + self._storage_power_obs: cp.Parameter = cp.Parameter(value=0.) 
SoC = np.zeros(shape=self.nb_max_bus) self._storage_setpoint: np.ndarray = 0.5 * env.storage_Emax @@ -527,15 +531,20 @@ def _update_th_lim_param(self, obs: BaseObservation): # TODO what if (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 is negative ! + def _update_storage_power_obs(self, obs: BaseObservation): + self._storage_power_obs.value += obs.storage_power.sum() + def _update_inj_param(self, obs: BaseObservation): + self._update_storage_power_obs(obs) + self.load_per_bus.value[:] = 0. self.gen_per_bus.value[:] = 0. load_p = 1.0 * obs.load_p - load_p *= (obs.gen_p.sum() - obs.storage_power.sum()) / load_p.sum() + load_p *=(obs.gen_p.sum() - self._storage_power_obs.value) / load_p.sum() for bus_id in range(self.nb_max_bus): self.load_per_bus.value[bus_id] += load_p[self.bus_load.value == bus_id].sum() - if self.bus_storage is not None: - self.load_per_bus.value[bus_id] += obs.storage_power[self.bus_storage.value == bus_id].sum() + # if self.bus_storage is not None: + # self.load_per_bus.value[bus_id] += obs.storage_power[self.bus_storage.value == bus_id].sum() self.gen_per_bus.value[bus_id] += obs.gen_p[self.bus_gen.value == bus_id].sum() def _add_redisp_const(self, obs: BaseObservation, bus_id: int): @@ -726,7 +735,7 @@ def compute_optimum_unsafe(self): f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) f_or_corr = f_or - self._alpha_por_error * self._prev_por_error inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) - energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) + energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) - self._storage_power_obs KCL_eq = self._aux_compute_kcl(inj_bus, f_or) theta_is_zero = self._mask_theta_zero() @@ -764,6 +773,7 @@ def compute_optimum_unsafe(self): if has_converged: self.flow_computed[:] = f_or.value res = (curtailment_mw.value, storage.value, redispatching.value) + self._storage_power_obs.value = 0. else: self.logger.error(f"Problem with the optimization for all tested solvers ({type(self).SOLVER_TYPES})") self.flow_computed[:] = np.NaN @@ -870,7 +880,7 @@ def to_grid2op(self, curtailment_[gen_curt] = aux_ curtailment_[~gen_curt] = -1. act.curtail_mw = curtailment_ - + # redispatching if np.any(np.abs(redispatching) > 0.): redisp_ = np.zeros(obs.n_gen) @@ -895,10 +905,10 @@ def to_grid2op(self, prop_to_gen = np.zeros(obs.n_gen) redisp_up = np.zeros(obs.n_gen, dtype=bool) redisp_up[gen_redi] = tmp_ > 0. - prop_to_gen[redisp_up] = obs.gen_max_ramp_up[redisp_up] + prop_to_gen[redisp_up] = obs.gen_margin_up[redisp_up] redisp_down = np.zeros(obs.n_gen, dtype=bool) redisp_down[gen_redi] = tmp_ < 0. - prop_to_gen[redisp_down] = obs.gen_max_ramp_down[redisp_down] + prop_to_gen[redisp_down] = obs.gen_margin_down[redisp_down] # avoid numeric issues nothing_happens = (redisp_avail[idx_gen] == 0.) & (prop_to_gen[gen_redi] == 0.) @@ -918,14 +928,7 @@ def to_grid2op(self, redisp_avail[idx_gen] = this_fix_ # Now I split the output of the optimization between the generators - try: - with warnings.catch_warnings(): - warnings.filterwarnings("error") - redisp_[gen_redi] = tmp_ * prop_to_gen[gen_redi] / redisp_avail[idx_gen] - except Exception as exc_: - print("a warning occured") - pdb.set_trace() - print("toto") + redisp_[gen_redi] = tmp_ * prop_to_gen[gen_redi] / redisp_avail[idx_gen] redisp_[~gen_redi] = 0. 
act.redispatch = redisp_ return act @@ -976,7 +979,7 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) f_or_corr = f_or - self._alpha_por_error * self._prev_por_error inj_bus = (self.load_per_bus + storage) - (self.gen_per_bus + redispatching - curtailment_mw) - energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) + energy_added = cp.sum(curtailment_mw) + cp.sum(storage) - cp.sum(redispatching) - self._storage_power_obs KCL_eq = self._aux_compute_kcl(inj_bus, f_or) theta_is_zero = self._mask_theta_zero() @@ -1021,9 +1024,11 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): # solve prob = cp.Problem(cp.Minimize(cost), constraints) has_converged = self._solve_problem(prob) + if has_converged: self.flow_computed[:] = f_or.value res = (curtailment_mw.value, storage.value, redispatching.value) + self._storage_power_obs.value = 0. else: self.logger.error(f"Problem with the optimization for all tested solvers ({type(self).SOLVER_TYPES})") self.flow_computed[:] = np.NaN @@ -1070,6 +1075,7 @@ def act(self, self._prev_por_error.value[~prev_ok] = 0. # print(f"{np.abs(self._prev_por_error.value).mean()}") # print(f"{np.abs(self._prev_por_error.value).max()}") + # print(f"step {obs.current_step} target dispatch: {obs.target_dispatch.sum():.2f} / {obs.storage_power.sum():.2f}") self.flow_computed[:] = np.NaN if obs.rho.max() > self.rho_danger: @@ -1106,10 +1112,10 @@ def act(self, else: # I do nothing between rho_danger and rho_safe self.logger.info(f"step {obs.current_step}, do nothing mode") + self._update_storage_power_obs(obs) act = self.action_space() self.flow_computed[:] = obs.p_or - return act if __name__ == "__main__": diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index bd4f5f6..e6b2fe5 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -221,6 +221,7 @@ def step(self, gym_action): if not done: g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) gym_obs = self.observation_space.to_gym(g2op_obs) + print(gym_obs.min(), gym_obs.max()) return gym_obs, float(reward), done, info def reset(self, seed=None, return_info=False, options=None): From c89bae5da0fe64b3e9c9e715ab2ce220924f878a Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Mon, 9 May 2022 09:05:17 +0200 Subject: [PATCH 41/56] before conflict --- examples/optim_cvxpy/optimcvxpy_wcci_2022.py | 2 +- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/optim_cvxpy/optimcvxpy_wcci_2022.py b/examples/optim_cvxpy/optimcvxpy_wcci_2022.py index 6e7e46b..b207cdd 100644 --- a/examples/optim_cvxpy/optimcvxpy_wcci_2022.py +++ b/examples/optim_cvxpy/optimcvxpy_wcci_2022.py @@ -41,7 +41,7 @@ "2050-11-14_31", "2050-12-19_31", ] -scen_test = ["2050-02-21_31"] +# scen_test = ["2050-02-21_31"] print("For do nothing: ") dn_act = env.action_space() for scen_id in scen_test: diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index 905e142..5c8e43f 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -714,7 +714,12 @@ def run_dc(self, obs: BaseObservation): self.flow_computed[:] = np.NaN return has_converged - + + def max_curtailment(self, obs): + # TODO find the maximum curtailment i can do 
without damaging the grid + # merge it with compute_optimum_safe(self, ...) + pass + def compute_optimum_unsafe(self): # variables theta = cp.Variable(shape=self.nb_max_bus) # at each bus @@ -918,14 +923,7 @@ def to_grid2op(self, redisp_avail[idx_gen] = this_fix_ # Now I split the output of the optimization between the generators - try: - with warnings.catch_warnings(): - warnings.filterwarnings("error") - redisp_[gen_redi] = tmp_ * prop_to_gen[gen_redi] / redisp_avail[idx_gen] - except Exception as exc_: - print("a warning occured") - pdb.set_trace() - print("toto") + redisp_[gen_redi] = tmp_ * prop_to_gen[gen_redi] / redisp_avail[idx_gen] redisp_[~gen_redi] = 0. act.redispatch = redisp_ return act From 700bac8cf16fa553457e475aa6892301461dcec1 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Mon, 9 May 2022 18:43:05 +0200 Subject: [PATCH 42/56] starting to fix the issue about normalization --- examples/ppo_stable_baselines/A_prep_env.py | 21 ++++++++++++ .../ppo_stable_baselines/B_train_agent.py | 14 +++++++- .../C_evaluate_trained_model.py | 22 ++++++++++--- l2rpn_baselines/PPO_SB3/evaluate.py | 2 ++ l2rpn_baselines/PPO_SB3/train.py | 32 ++++++++++++++++--- l2rpn_baselines/utils/gymenv_custom.py | 9 +++++- 6 files changed, 90 insertions(+), 10 deletions(-) diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index 6631b39..39383e1 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -22,6 +22,7 @@ env_name = "l2rpn_icaps_2021_small" env_name = "l2rpn_wcci_2022_dev" +env_name = "wcci_2022_dev" SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 name_stats = "_reco_powerline" @@ -132,3 +133,23 @@ def get_env_seed(env_name: str): stats_reco.compute(nb_scenario=nb_scenario, agent=reco_powerline_agent, env_seeds=seeds) + + if nm_ == nm_val: + # save the normalization parameters from the validation set + dict_ = {"subtract": {}, 'divide': {}} + for attr_nm in ["gen_p", "load_p", "p_or", "rho"]: + avg_ = stats_reco.get(attr_nm)[0].mean(axis=0) + std_ = stats_reco.get(attr_nm)[0].std(axis=0) + dict_["subtract"][attr_nm] = [float(el) for el in avg_] + dict_["divide"][attr_nm] = [max(float(el), 1.0) for el in std_] + + with open("preprocess_obs.json", "w", encoding="utf-8") as f: + json.dump(obj=dict_, fp=f) + + act_space_kwargs = {"add": {"redispatch": [TODO], + "set_storage": [TODO]}, + 'multiply': {"redispatch": [TODO], + "set_storage": [TODO]} + } + with open("preprocess_act.json", "w", encoding="utf-8") as f: + json.dump(obj=dict_, fp=f) diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 98a3984..4d96fab 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -14,6 +14,7 @@ import os import re +import json import numpy as np from grid2op.Reward import BaseReward from grid2op.Action import PlayableAction @@ -22,6 +23,7 @@ env_name = "l2rpn_icaps_2021_small_train" env_name = "l2rpn_wcci_2022_dev_train" env_name = "wcci_2022_dev_2" +env_name = "wcci_2022_dev" env_name = "l2rpn_case14_sandbox" save_path = "./saved_model" name = "expe_GymEnvWithRecoWithDN_2022_test5" @@ -125,9 +127,9 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # curtailment part of the observation "curtailment", "curtailment_limit", "gen_p_before_curtail", ] + TODO = ... 
# same here you can change it as you please act_attr_to_keep = ["redispatch", "curtail", "set_storage"] - # parameters for the learning nb_iter = 300_000 learning_rate = 3e-4 @@ -139,6 +141,14 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): reward_class=CustomReward, backend=LightSimBackend(), chronics_class=MultifolderWithCache) + + # with open("preprocess_obs.json", "r", encoding="utf-8") as f: + # obs_space_kwargs = json.load(f) + # with open("preprocess_act.json", "r", encoding="utf-8") as f: + # act_space_kwargs = json.load(f) + + obs_space_kwargs = None + act_space_kwargs = None param = env.parameters param.LIMIT_INFEASIBLE_CURTAILMENT_STORAGE_ACTION = True env.change_parameters(param) @@ -163,7 +173,9 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): logs_dir="./logs", save_path=save_path, obs_attr_to_keep=obs_attr_to_keep, + obs_space_kwargs=obs_space_kwargs, act_attr_to_keep=act_attr_to_keep, + act_space_kwargs=act_space_kwargs, normalize_act=True, normalize_obs=True, name=name, diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index 390eeb6..8acab04 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -6,7 +6,7 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -from tabnanny import verbose +import json import numpy as np import grid2op @@ -33,14 +33,18 @@ def load_agent(env, load_path, name, gymenv_class=gymenv_class, - gymenv_kwargs={"safe_max_rho": safe_max_rho}): + gymenv_kwargs={"safe_max_rho": safe_max_rho}, + obs_space_kwargs=None, + act_space_kwargs=None): trained_agent, _ = evaluate(env, nb_episode=0, load_path=load_path, name=name, gymenv_class=gymenv_class, iter_num=iter_num, - gymenv_kwargs=gymenv_kwargs) + gymenv_kwargs=gymenv_kwargs, + obs_space_kwargs=obs_space_kwargs, + act_space_kwargs=act_space_kwargs) return trained_agent @@ -82,7 +86,17 @@ def get_ts_survived_reco(env_name): nb_process_stats=nb_process_stats, ) - my_agent = load_agent(env_val, load_path=load_path, name=agent_name, gymenv_class=gymenv_class) + with open("preprocess_obs.json", "r", encoding="utf-8") as f: + obs_space_kwargs = json.load(f) + with open("preprocess_act.json", "r", encoding="utf-8") as f: + act_space_kwargs = json.load(f) + + my_agent = load_agent(env_val, + load_path=load_path, + name=agent_name, + gymenv_class=gymenv_class, + obs_space_kwargs=obs_space_kwargs, + act_space_kwargs=act_space_kwargs) _, ts_survived, _ = my_score.get(my_agent) # compare with do nothing diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index 5dc6aff..2157dc6 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -27,6 +27,8 @@ def evaluate(env, save_gif=False, gymenv_class=GymEnv, gymenv_kwargs=None, + obs_space_kwargs=None, # TODO + act_space_kwargs=None, # TODO iter_num=None, **kwargs): """ diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py index ecad643..ddb0070 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -49,7 +49,9 @@ def train(env, save_every_xxx_steps=None, model_policy=MlpPolicy, obs_attr_to_keep=copy.deepcopy(default_obs_attr_to_keep), + obs_space_kwargs=None, act_attr_to_keep=copy.deepcopy(default_act_attr_to_keep), + 
act_space_kwargs=None, policy_kwargs=None, normalize_obs=False, normalize_act=False, @@ -74,7 +76,7 @@ def train(env, Parameters ---------- env: :class:`grid2op.Environment` - Then environment on which you need to train your agent. + The environment on which you need to train your agent. name: ``str``` The name of your agent. @@ -113,12 +115,18 @@ def train(env, as the "attr_to_keep" value of the BoxObservation space (see https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymObsSpace) + + obs_space_kwargs: + Extra kwargs to build the BoxGymObsSpace (**NOT** saved then NOT restored) act_attr_to_keep: list of string Grid2op attribute to use to build the BoxGymActSpace. It is passed as the "attr_to_keep" value of the BoxAction space (see https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymActSpace) + + act_space_kwargs: + Extra kwargs to build the BoxGymActSpace (**NOT** saved then NOT restored) verbose: ``bool`` If you want something to be printed on the terminal (a better logging strategy will be put at some point) @@ -135,7 +143,7 @@ def train(env, The class to use as a gym environment. By default `GymEnv` (from module grid2op.gym_compat) gymenv_kwargs: ``dict`` - Extra key words arguments to build the gym environment. + Extra key words arguments to build the gym environment., **NOT** saved / restored by this class policy_kwargs: ``dict`` extra parameters passed to the PPO "policy_kwargs" key word arguments @@ -210,11 +218,17 @@ def train(env, gymenv_kwargs = {} env_gym = gymenv_class(env, **gymenv_kwargs) env_gym.observation_space.close() + if obs_space_kwargs is None: + obs_space_kwargs = {} env_gym.observation_space = BoxGymObsSpace(env.observation_space, - attr_to_keep=obs_attr_to_keep) + attr_to_keep=obs_attr_to_keep, + **obs_space_kwargs) env_gym.action_space.close() + if act_space_kwargs is None: + act_space_kwargs = {} env_gym.action_space = BoxGymActSpace(env.action_space, - attr_to_keep=act_attr_to_keep) + attr_to_keep=act_attr_to_keep, + **act_space_kwargs) if normalize_act: if save_path is not None: @@ -222,6 +236,11 @@ def train(env, mode="w") as f: f.write("I have encoded the action space !\n DO NOT MODIFY !") for attr_nm in act_attr_to_keep: + if (("multiply" in act_attr_to_keep and attr_nm in act_attr_to_keep["divide"]) or + ("subtract" in act_attr_to_keep and attr_nm in act_attr_to_keep["subtract"]) + ): + # attribute is scaled elsewhere + continue env_gym.action_space.normalize_attr(attr_nm) if normalize_obs: @@ -230,6 +249,11 @@ def train(env, mode="w") as f: f.write("I have encoded the observation space !\n DO NOT MODIFY !") for attr_nm in obs_attr_to_keep: + if (("divide" in obs_attr_to_keep and attr_nm in obs_space_kwargs["divide"]) or + ("subtract" in obs_attr_to_keep and attr_nm in obs_space_kwargs["subtract"]) + ): + # attribute is scaled elsewhere + continue env_gym.observation_space.normalize_attr(attr_nm) # Save a checkpoint every "save_every_xxx_steps" steps diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index e6b2fe5..7b6a19c 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -221,7 +221,14 @@ def step(self, gym_action): if not done: g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) gym_obs = self.observation_space.to_gym(g2op_obs) - print(gym_obs.min(), gym_obs.max()) + # print(gym_obs.min(), gym_obs.max()) + # print(gym_obs) + import pdb + pdb.set_trace() + for attr_nm in 
self.observation_space._attr_to_keep: + print(f"{attr_nm} {self.observation_space._handle_attribute(g2op_obs, attr_nm)}") + + raise RuntimeError("stop") return gym_obs, float(reward), done, info def reset(self, seed=None, return_info=False, options=None): From 5057456ee503f85f5ddef31ccf1fbc8724c0ce17 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Tue, 10 May 2022 13:39:53 +0200 Subject: [PATCH 43/56] improve the examples to better normalize the action / observation --- .gitignore | 2 ++ examples/ppo_stable_baselines/A_prep_env.py | 10 ++++---- .../ppo_stable_baselines/B_train_agent.py | 23 ++++++++++--------- .../C_evaluate_trained_model.py | 1 + l2rpn_baselines/PPO_SB3/evaluate.py | 16 +++++++++++-- l2rpn_baselines/PPO_SB3/train.py | 11 ++++----- l2rpn_baselines/utils/gymenv_custom.py | 8 ------- 7 files changed, 39 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 9481b54..9f0db40 100644 --- a/.gitignore +++ b/.gitignore @@ -191,3 +191,5 @@ tensorboard/ test_sac/ documentation/ test_issue_glop.py +examples/ppo_stable_baselines/preprocess_act.json +examples/ppo_stable_baselines/preprocess_obs.json diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index 39383e1..3297730 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -146,10 +146,10 @@ def get_env_seed(env_name: str): with open("preprocess_obs.json", "w", encoding="utf-8") as f: json.dump(obj=dict_, fp=f) - act_space_kwargs = {"add": {"redispatch": [TODO], - "set_storage": [TODO]}, - 'multiply': {"redispatch": [TODO], - "set_storage": [TODO]} + act_space_kwargs = {"add": {"redispatch": [0. for gen_id in range(env.n_gen) if env.gen_redispatchable[gen_id]], + "set_storage": [0. for _ in range(env.n_storage)]}, + 'multiply': {"redispatch": [1. / (max(float(el), 1.0)) for gen_id, el in enumerate(env.gen_max_ramp_up) if env.gen_redispatchable[gen_id]], + "set_storage": [1. / (max(float(el), 1.0)) for el in env.storage_max_p_prod]} } with open("preprocess_act.json", "w", encoding="utf-8") as f: - json.dump(obj=dict_, fp=f) + json.dump(obj=act_space_kwargs, fp=f) diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 4d96fab..4942def 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -23,10 +23,11 @@ env_name = "l2rpn_icaps_2021_small_train" env_name = "l2rpn_wcci_2022_dev_train" env_name = "wcci_2022_dev_2" -env_name = "wcci_2022_dev" -env_name = "l2rpn_case14_sandbox" +env_name = "wcci_2022_dev_train" +# env_name = "l2rpn_case14_sandbox" save_path = "./saved_model" -name = "expe_GymEnvWithRecoWithDN_2022_test5" +name = "test_normalize_features" +name = "test_1" gymenv_class = GymEnvWithRecoWithDN # uses the heuristic to do nothing is the grid is not at risk and to reconnect powerline automatically max_iter = 7 * 24 * 12 # None to deactivate it safe_max_rho = 0.9 # the grid is said "safe" if the rho is lower than this value, it is a really important parameter to tune ! @@ -34,11 +35,12 @@ # customize the reward function (optional) class CustomReward(BaseReward): - def __init__(self): + def __init__(self, logger=None): """ Initializes :attr:`BaseReward.reward_min` and :attr:`BaseReward.reward_max` """ + BaseReward.__init__(self, logger=logger) self.reward_min = 0. self.reward_max = 1. 
self._min_rho = 0.90 @@ -142,13 +144,11 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): backend=LightSimBackend(), chronics_class=MultifolderWithCache) - # with open("preprocess_obs.json", "r", encoding="utf-8") as f: - # obs_space_kwargs = json.load(f) - # with open("preprocess_act.json", "r", encoding="utf-8") as f: - # act_space_kwargs = json.load(f) + with open("preprocess_obs.json", "r", encoding="utf-8") as f: + obs_space_kwargs = json.load(f) + with open("preprocess_act.json", "r", encoding="utf-8") as f: + act_space_kwargs = json.load(f) - obs_space_kwargs = None - act_space_kwargs = None param = env.parameters param.LIMIT_INFEASIBLE_CURTAILMENT_STORAGE_ACTION = True env.change_parameters(param) @@ -160,8 +160,9 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: True) - env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*500$", x) is not None) + # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*500$", x) is not None) # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-08-01_.*$", x) is not None) + env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-02-.*$", x) is not None) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline # for more information ! diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index 8acab04..4020d6d 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -21,6 +21,7 @@ env_name = "l2rpn_icaps_2021_small_val" env_name = "l2rpn_wcci_2022_dev_val" +env_name = "wcci_2022_dev_val" SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 agent_name = name diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index 2157dc6..ef18f6d 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -163,15 +163,27 @@ def evaluate(env, act_attr_to_keep = json.load(fp=f) # create the action and observation space - gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep) - gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep) + gym_observation_space = BoxGymObsSpace(env.observation_space, + attr_to_keep=obs_attr_to_keep, + **obs_space_kwargs) + gym_action_space = BoxGymActSpace(env.action_space, + attr_to_keep=act_attr_to_keep, + **act_space_kwargs) if os.path.exists(os.path.join(load_path, ".normalize_act")): for attr_nm in act_attr_to_keep: + if (("multiply" in act_space_kwargs and attr_nm in act_space_kwargs["multiply"]) or + ("add" in act_space_kwargs and attr_nm in act_space_kwargs["add"]) + ): + continue gym_action_space.normalize_attr(attr_nm) if os.path.exists(os.path.join(load_path, ".normalize_obs")): for attr_nm in obs_attr_to_keep: + if (("divide" in obs_space_kwargs and attr_nm in obs_space_kwargs["divide"]) or + ("subtract" in obs_space_kwargs and attr_nm in obs_space_kwargs["subtract"]) + ): + continue gym_observation_space.normalize_attr(attr_nm) gymenv = None diff --git a/l2rpn_baselines/PPO_SB3/train.py b/l2rpn_baselines/PPO_SB3/train.py 
index ddb0070..a8fa92c 100644 --- a/l2rpn_baselines/PPO_SB3/train.py +++ b/l2rpn_baselines/PPO_SB3/train.py @@ -6,8 +6,6 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. -import pdb -from tabnanny import verbose import warnings import copy import os @@ -57,6 +55,7 @@ def train(env, normalize_act=False, gymenv_class=GymEnv, gymenv_kwargs=None, + verbose=True, seed=None, # TODO eval_env=None, # TODO **kwargs): @@ -236,8 +235,8 @@ def train(env, mode="w") as f: f.write("I have encoded the action space !\n DO NOT MODIFY !") for attr_nm in act_attr_to_keep: - if (("multiply" in act_attr_to_keep and attr_nm in act_attr_to_keep["divide"]) or - ("subtract" in act_attr_to_keep and attr_nm in act_attr_to_keep["subtract"]) + if (("multiply" in act_space_kwargs and attr_nm in act_space_kwargs["multiply"]) or + ("add" in act_space_kwargs and attr_nm in act_space_kwargs["add"]) ): # attribute is scaled elsewhere continue @@ -249,8 +248,8 @@ def train(env, mode="w") as f: f.write("I have encoded the observation space !\n DO NOT MODIFY !") for attr_nm in obs_attr_to_keep: - if (("divide" in obs_attr_to_keep and attr_nm in obs_space_kwargs["divide"]) or - ("subtract" in obs_attr_to_keep and attr_nm in obs_space_kwargs["subtract"]) + if (("divide" in obs_space_kwargs and attr_nm in obs_space_kwargs["divide"]) or + ("subtract" in obs_space_kwargs and attr_nm in obs_space_kwargs["subtract"]) ): # attribute is scaled elsewhere continue diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index 7b6a19c..bd4f5f6 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -221,14 +221,6 @@ def step(self, gym_action): if not done: g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info) gym_obs = self.observation_space.to_gym(g2op_obs) - # print(gym_obs.min(), gym_obs.max()) - # print(gym_obs) - import pdb - pdb.set_trace() - for attr_nm in self.observation_space._attr_to_keep: - print(f"{attr_nm} {self.observation_space._handle_attribute(g2op_obs, attr_nm)}") - - raise RuntimeError("stop") return gym_obs, float(reward), done, info def reset(self, seed=None, return_info=False, options=None): From 01c9de6baa3118800c611fca57fddf7fc261a091 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Tue, 10 May 2022 14:29:02 +0200 Subject: [PATCH 44/56] finding good parameters for the case14 storage --- .../optimcvxpy_educ_case14_storage.py | 34 ++++++++----------- examples/ppo_stable_baselines/ReadMe.md | 3 +- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py index a95c0ce..f004517 100644 --- a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py +++ b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py @@ -14,6 +14,8 @@ from tqdm import tqdm import pdb +max_step = 288 + if __name__ == "__main__": env = grid2op.make("educ_case14_storage", test=True, @@ -23,11 +25,11 @@ agent = OptimCVXPY(env.action_space, env, penalty_redispatching_unsafe=0., - penalty_storage_unsafe=0.1, + penalty_storage_unsafe=0.04, ### penalty_curtailment_unsafe=0.01, - rho_safe=0.95, - rho_danger=0.97, - margin_th_limit=0.95, + rho_safe=0.95, ### + rho_danger=0.97, ### + margin_th_limit=0.93, ### alpha_por_error=0.5, weight_redisp_target=0.3, ) @@ -41,31 +43,23 @@ # env.set_id(scen_id) # obs = env.reset() # done = 
False - # for nb_step in tqdm(range(288)): + # for nb_step in tqdm(range(max_step)): # obs, reward, done, info = env.step(dn_act) - # if done: + # if done and nb_step != (max_step-1): # break - # print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 288") + # print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {max_step}") print("For the optimizer: ") - for scen_id in range(7): - # if scen_id != 2: - # continue - + for scen_id in range(7): env.set_id(scen_id) obs = env.reset() agent.reset(obs) done = False - for nb_step in tqdm(range(288)): + for nb_step in tqdm(range(max_step)): prev_obs = obs - # agent._DEBUG = nb_step >= 22 - # agent._DEBUG = nb_step >= 10 - # agent._DEBUG = nb_step >= 190 act = agent.act(obs) obs, reward, done, info = env.step(act) - if done: - # print(info) - # print(prev_obs.storage_charge) - # print(prev_obs.target_dispatch) + if done and nb_step != (max_step-1): + # there is a game over before the end break - print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / 288") + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {max_step}") diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md index 1cc926b..b2a1f1f 100644 --- a/examples/ppo_stable_baselines/ReadMe.md +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -8,7 +8,7 @@ It will be usable on the `l2rpn_icaps_2021` grid2op environment It is organized as follow: -1) you split the environment into training and validation +1) you split the environment into training, validation and test 2) you train the agent (do not hesitate to change the parameters there) on the training set 3) you evaluate it on a dataset not used for training ! @@ -24,6 +24,7 @@ In this phase, we do 3 things: - we split the data set into a training, validation and test set. This is quite standard in ML (less in RL) and its main goal is to prevent overfitting. (we remind the scenarios on codalab will be different from the training set provided, though drawn from the same distribution) - we initialize the computation of the scores. In the case of l2rpn competitions, the score is cannot be easily made into a reward function, it can only be computed when knowing the entire episode, at the end of the episode\*. - we compute the score of a few "standard" baselines to compared the trained agent with +- we use the previous runs to compute some "statistics" (average and standard deviation) used to normalize the actions / observations in the later scripts. \* of course you can make a sparse reward from it. Your agent receive always 0.0 unless when "done = True" (so last step of the episode) where this score can be computed. This is not the approach we took here. 
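As a rough illustration of how the saved statistics mentioned in the ReadMe are meant to be consumed (a minimal sketch only, assuming the {"subtract": ..., "divide": ...} layout written to preprocess_obs.json by A_prep_env.py; the attribute names and numbers below are made up): each kept observation attribute is rescaled roughly as (x - subtract[attr]) / divide[attr], with divide floored at 1.0 so near-constant attributes are not amplified.

import numpy as np

# made-up statistics, same layout as the preprocess_obs.json produced by A_prep_env.py
stats = {"subtract": {"gen_p": [50.0, 80.0], "rho": [0.25, 0.5, 0.75]},
         "divide":   {"gen_p": [10.0, 20.0], "rho": [1.0, 1.0, 1.0]}}

def normalize_obs(obs_dict, stats):
    # center then scale every attribute, mirroring the (x - subtract) / divide convention
    out = {}
    for attr, value in obs_dict.items():
        sub = np.asarray(stats["subtract"].get(attr, 0.), dtype=float)
        div = np.asarray(stats["divide"].get(attr, 1.), dtype=float)
        out[attr] = (np.asarray(value, dtype=float) - sub) / div
    return out

raw = {"gen_p": [55.0, 100.0], "rho": [0.375, 0.75, 1.0]}
print(normalize_obs(raw, stats))  # gen_p -> [0.5, 1.0], rho -> [0.125, 0.25, 0.25]

In the training and evaluation scripts above, these dictionaries are not applied by hand as in this sketch: they are simply loaded with json.load and passed as obs_space_kwargs (and, with "add" / "multiply" keys, as act_space_kwargs), so the scaling itself is delegated to the gym observation / action spaces.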
From 130046bc9e1cc1d3bf0cb9f19262d3815affaa41 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Tue, 10 May 2022 14:39:29 +0200 Subject: [PATCH 45/56] script for the optimizer for case14 storage working --- .../optimcvxpy_educ_case14_storage.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py index f004517..b8af6b8 100644 --- a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py +++ b/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py @@ -25,11 +25,11 @@ agent = OptimCVXPY(env.action_space, env, penalty_redispatching_unsafe=0., - penalty_storage_unsafe=0.04, ### + penalty_storage_unsafe=0.04, penalty_curtailment_unsafe=0.01, - rho_safe=0.95, ### - rho_danger=0.97, ### - margin_th_limit=0.93, ### + rho_safe=0.95, + rho_danger=0.97, + margin_th_limit=0.93, alpha_por_error=0.5, weight_redisp_target=0.3, ) @@ -37,17 +37,17 @@ # in safe / recovery mode agent tries to fill the storage units as much as possible agent.storage_setpoint = env.storage_Emax - # print("For do nothing: ") - # dn_act = env.action_space() - # for scen_id in range(7): - # env.set_id(scen_id) - # obs = env.reset() - # done = False - # for nb_step in tqdm(range(max_step)): - # obs, reward, done, info = env.step(dn_act) - # if done and nb_step != (max_step-1): - # break - # print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {max_step}") + print("For do nothing: ") + dn_act = env.action_space() + for scen_id in range(7): + env.set_id(scen_id) + obs = env.reset() + done = False + for nb_step in tqdm(range(max_step)): + obs, reward, done, info = env.step(dn_act) + if done and nb_step != (max_step-1): + break + print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {max_step}") print("For the optimizer: ") for scen_id in range(7): From 8ae7735782acc1be0b84271a794033e3d6e23885 Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Tue, 24 May 2022 19:10:54 +0200 Subject: [PATCH 46/56] improving the optimizer for making it work with the ieee118 --- ...se14_storage.py => educ_case14_storage.py} | 0 ...mcvxpy_wcci_2022.py => l2rpn_wcci_2022.py} | 48 ++++++++++- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 85 +++++++++++-------- 3 files changed, 95 insertions(+), 38 deletions(-) rename examples/optim_cvxpy/{optimcvxpy_educ_case14_storage.py => educ_case14_storage.py} (100%) rename examples/optim_cvxpy/{optimcvxpy_wcci_2022.py => l2rpn_wcci_2022.py} (56%) diff --git a/examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py b/examples/optim_cvxpy/educ_case14_storage.py similarity index 100% rename from examples/optim_cvxpy/optimcvxpy_educ_case14_storage.py rename to examples/optim_cvxpy/educ_case14_storage.py diff --git a/examples/optim_cvxpy/optimcvxpy_wcci_2022.py b/examples/optim_cvxpy/l2rpn_wcci_2022.py similarity index 56% rename from examples/optim_cvxpy/optimcvxpy_wcci_2022.py rename to examples/optim_cvxpy/l2rpn_wcci_2022.py index b207cdd..9ffd3c6 100644 --- a/examples/optim_cvxpy/optimcvxpy_wcci_2022.py +++ b/examples/optim_cvxpy/l2rpn_wcci_2022.py @@ -7,6 +7,8 @@ # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
import os +import sys +import logging import grid2op from l2rpn_baselines.OptimCVXPY import OptimCVXPY from lightsim2grid import LightSimBackend @@ -21,11 +23,31 @@ backend=LightSimBackend() ) + +# logger: logging.Logger = logging.getLogger(__name__) +# logger.disabled = False +# logger.addHandler(logging.StreamHandler(sys.stdout)) +# logger.setLevel(level=logging.DEBUG) +logger = None + + +# scenario : 349 steps +# agent = OptimCVXPY(env.action_space, +# env, +# penalty_redispatching_unsafe=0., +# penalty_storage_unsafe=0.01, +# penalty_curtailment_unsafe=0.01, +# logger=logger +# ) + agent = OptimCVXPY(env.action_space, env, penalty_redispatching_unsafe=0., penalty_storage_unsafe=0.01, penalty_curtailment_unsafe=0.01, + penalty_curtailment_safe=0.1, + penalty_redispatching_safe=0.1, + logger=logger ) scen_test = ["2050-01-03_31", @@ -41,7 +63,11 @@ "2050-11-14_31", "2050-12-19_31", ] -# scen_test = ["2050-02-21_31"] +# scen_test = ["2050-02-21_31", +# "2050-09-26_31" +# ] +# scen_test = ["2050-01-03_31"] + print("For do nothing: ") dn_act = env.action_space() for scen_id in scen_test: @@ -49,21 +75,37 @@ obs = env.reset() done = False for nb_step in tqdm(range(obs.max_step)): + prev_obs = obs obs, reward, done, info = env.step(dn_act) - if done: + if done and (nb_step != prev_obs.max_step - 1): break print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {obs.max_step}") print("For the optimizer: ") for scen_id in scen_test: + act = None env.set_id(scen_id) + env.seed(0) obs = env.reset() agent.reset(obs) done = False for nb_step in tqdm(range(obs.max_step)): prev_obs = obs + prev_act = act + # agent._DEBUG = nb_step >= 1840 + # agent._DEBUG = nb_step >= 949 + # agent._DEBUG = nb_step >= 705 + # agent._DEBUG = nb_step >= 154 + # agent._DEBUG = nb_step >= 82 act = agent.act(obs) obs, reward, done, info = env.step(act) - if done: + # print(f"{obs.target_dispatch.sum():.2f}, {obs.storage_power.sum():.2f}, {obs.curtailment_mw.sum():.2f}, {obs.curtailment_limit[12]:.2f}") + # print([f"{el:.2f}" for el in obs.curtailment_limit[[12, 14, 15, 21, 24]]]) + # gen_id = 12 + # print(f"limit: {obs.curtailment_limit[gen_id]:.2f}, " + # f"actual gen: {obs.gen_p[gen_id] / obs.gen_pmax[gen_id] :.2f}, " + # f"possible gen: {obs.gen_p_before_curtail[gen_id] / obs.gen_pmax[gen_id] :.2f}") + if done and (nb_step != prev_obs.max_step - 1): + # pdb.set_trace() break print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {obs.max_step}") diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index 8471e38..97bac6c 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -59,7 +59,6 @@ class OptimCVXPY(BaseAgent): Have a look at the documentation for more details about the optimization problems solved in each case. 
- Parameters ---------- action_space : `grid2op.Action.ActionSpace` @@ -159,9 +158,10 @@ def __init__(self, penalty_redispatching_unsafe: float=0.03, penalty_storage_unsafe: float=0.3, penalty_curtailment_safe: float=0.0, - penalty_redispatching_safe: float=0.00, + penalty_redispatching_safe: float=0.0, weight_redisp_target: float=1.0, weight_storage_target: float=1.0, + weight_curtail_target: float=1.0, penalty_storage_safe: float=0.0, margin_rounding: float=0.01, margin_sparse: float=5e-3, @@ -241,6 +241,8 @@ def __init__(self, weight_redisp_target: `float` + weight_curtail_target: `float` + margin_rounding: `float` A margin taken to avoid rounding issues that could lead to infeasible actions due to "redispatching above max_ramp_up" for example. @@ -305,6 +307,8 @@ def __init__(self, nonneg=True) self._weight_storage_target: cp.Parameter = cp.Parameter(value=weight_storage_target, nonneg=True) + self._weight_curtail_target = cp.Parameter(value=weight_curtail_target, + nonneg=True) # takes into account the previous errors on the flows (in an additive fashion) # new flows are 1/x(theta_or - theta_ex) * alpha_por_error . (prev_flows - obs.p_or) self._alpha_por_error: cp.Parameter = cp.Parameter(value=alpha_por_error, @@ -590,7 +594,7 @@ def _update_constraints_param_unsafe(self, obs: BaseObservation): self._remove_margin_rounding() - def _remove_margin_rounding(self): + def _remove_margin_rounding(self): self.storage_down.value[self.storage_down.value > self.margin_rounding] -= self.margin_rounding self.storage_up.value[self.storage_up.value > self.margin_rounding] -= self.margin_rounding self.curtail_down.value[self.curtail_down.value > self.margin_rounding] -= self.margin_rounding @@ -796,7 +800,7 @@ def _solve_problem(self, prob, solver_type=None): for solver_type in type(self).SOLVER_TYPES: res = self._solve_problem(prob, solver_type=solver_type) if res: - self.logger.info(f"Solver {solver_type} has converged. Stopping there.") + self.logger.info(f"Solver {solver_type} has converged. Stopping solver search now.") return True return False @@ -826,7 +830,8 @@ def to_grid2op(self, curtailment: np.ndarray, storage: np.ndarray, redispatching: np.ndarray, - act: BaseAction =None) -> BaseAction: + act: BaseAction =None, + safe=False) -> BaseAction: """Convert the action (given as vectors of real number output of the optimizer) to a valid grid2op action. @@ -847,6 +852,10 @@ def to_grid2op(self, act : BaseAction, optional The previous action to modify (if any), by default None + safe: bool, optional + Whether this function is called from the "safe state" (in this case it allows to reset + all curtailment for example) or not. + Returns ------- BaseAction @@ -869,22 +878,36 @@ def to_grid2op(self, # in the amount of MW you remove, grid2op # expects a maximum value if np.any(np.abs(curtailment) > 0.): - curtailment_ = np.zeros(shape=act.n_gen) -1. + curtailment_mw = np.zeros(shape=act.n_gen) -1. gen_curt = obs.gen_renewable & (obs.gen_p > 0.1) idx_gen = self.bus_gen.value[gen_curt] tmp_ = curtailment[idx_gen] modif_gen_optim = tmp_ != 0. gen_p = 1.0 * obs.gen_p - aux_ = curtailment_[gen_curt] + aux_ = curtailment_mw[gen_curt] aux_[modif_gen_optim] = (gen_p[gen_curt][modif_gen_optim] - tmp_[modif_gen_optim] * gen_p[gen_curt][modif_gen_optim] / self.gen_per_bus.value[idx_gen][modif_gen_optim] ) aux_[~modif_gen_optim] = -1. - curtailment_[gen_curt] = aux_ - curtailment_[~gen_curt] = -1. - act.curtail_mw = curtailment_ + curtailment_mw[gen_curt] = aux_ + curtailment_mw[~gen_curt] = -1. 
+ + if safe: + # id of the generators that are "curtailed" at their max value + # in safe mode i remove all curtailment + gen_id_max = (curtailment_mw >= obs.gen_p_before_curtail ) & obs.gen_renewable + if np.any(gen_id_max): + curtailment_mw[gen_id_max] = act.gen_pmax[gen_id_max] + act.curtail_mw = curtailment_mw + elif safe and np.abs(self.curtail_down.value).max() == 0.: + # if curtail_down is all 0. then it means all generators are at their max + # output in the observation, curtailment is de facto to 1, I "just" + # need to tell it. + vect = 1.0 * act.gen_pmax + vect[~obs.gen_renewable] = -1. + act.curtail_mw = vect # redispatching if np.any(np.abs(redispatching) > 0.): @@ -948,9 +971,9 @@ def _update_constraints_param_safe(self, obs): # storage self._add_storage_const(obs, bus_id) - # curtailment #TODO - # mask_ = (self.bus_gen.value == bus_id) & obs.gen_renewable - # self.curtail_down.value[bus_id] = obs.gen_pmax[mask_].sum() - tmp_[mask_].sum() + # curtailment + mask_ = (self.bus_gen.value == bus_id) & obs.gen_renewable + self.curtail_down.value[bus_id] = obs.gen_p_before_curtail[mask_].sum() - tmp_[mask_].sum() # self.curtail_up.value[bus_id] = tmp_[mask_].sum() # storage target @@ -962,10 +985,7 @@ def _update_constraints_param_safe(self, obs): self._past_state_of_charge.value[bus_id] = obs.storage_charge[self.bus_storage.value == bus_id].sum() self._past_dispatch.value[bus_id] = obs.target_dispatch[self.bus_gen.value == bus_id].sum() - #TODO - self.curtail_down.value[:] = 0. - self.curtail_up.value[:] = 0. - + self.curtail_up.value[:] = 0. # never do more curtailment in "safe" mode self._remove_margin_rounding() def compute_optimum_safe(self, obs: BaseObservation, l_id=None): @@ -978,8 +998,8 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): theta = cp.Variable(shape=self.nb_max_bus) # at each bus curtailment_mw = cp.Variable(shape=self.nb_max_bus) # at each bus storage = cp.Variable(shape=self.nb_max_bus) # at each bus - redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus - + redispatching = cp.Variable(shape=self.nb_max_bus) # at each bus + # usefull quantities f_or = cp.multiply(1. / self._powerlines_x , (theta[self.bus_or.value] - theta[self.bus_ex.value])) f_or_corr = f_or - self._alpha_por_error * self._prev_por_error @@ -1013,20 +1033,17 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): # bus and generator variation should sum to 0. 
(not sure it's mandatory) [energy_added == 0] ) - - # TODO (in ctor) redisp_target - # TODO (in ctor) curtail_target # objective - # cost = cp.norm1(gp_var) + cp.norm1(lp_var) - cost = ( self._penalty_curtailment_safe * cp.sum_squares(curtailment_mw) + + cost = ( self._penalty_curtailment_safe * cp.sum_squares(curtailment_mw) + self._penalty_storage_safe * cp.sum_squares(storage) + self._penalty_redispatching_safe * cp.sum_squares(redispatching) + self._weight_redisp_target * cp.sum_squares(dispatch_after_this) + - self._weight_storage_target * cp.sum_squares(state_of_charge_after - self._storage_target_bus) + self._weight_storage_target * cp.sum_squares(state_of_charge_after - self._storage_target_bus) + + self._weight_curtail_target * cp.sum_squares(curtailment_mw + self.curtail_down) # I want curtailment to be negative ) - - # solve + + # solve the problem prob = cp.Problem(cp.Minimize(cost), constraints) has_converged = self._solve_problem(prob) @@ -1034,6 +1051,7 @@ def compute_optimum_safe(self, obs: BaseObservation, l_id=None): self.flow_computed[:] = f_or.value res = (curtailment_mw.value, storage.value, redispatching.value) self._storage_power_obs.value = 0. + # TODO : assign a value to curtailment_mw that makes it "+1" (cancel curtailment) in the next stuff else: self.logger.error(f"Problem with the optimization for all tested solvers ({type(self).SOLVER_TYPES})") self.flow_computed[:] = np.NaN @@ -1074,13 +1092,11 @@ def act(self, BaseAction The action the agent would do - """ + """ prev_ok = np.isfinite(self.flow_computed) - self._prev_por_error.value[prev_ok] = self.flow_computed[prev_ok] - obs.p_or[prev_ok] + # only keep the negative error (meaning I underestimated the flow) + self._prev_por_error.value[prev_ok] = np.minimum(self.flow_computed[prev_ok] - obs.p_or[prev_ok], 0.) self._prev_por_error.value[~prev_ok] = 0. 
- # print(f"{np.abs(self._prev_por_error.value).mean()}") - # print(f"{np.abs(self._prev_por_error.value).max()}") - # print(f"step {obs.current_step} target dispatch: {obs.target_dispatch.sum():.2f} / {obs.storage_power.sum():.2f}") self.flow_computed[:] = np.NaN if obs.rho.max() > self.rho_danger: @@ -1091,7 +1107,7 @@ def act(self, # solve the problem curtailment, storage, redispatching = self.compute_optimum_unsafe() # get back the grid2op representation - act = self.to_grid2op(obs, curtailment, storage, redispatching) + act = self.to_grid2op(obs, curtailment, storage, redispatching, safe=False) elif obs.rho.max() < self.rho_safe: # I attempt to get back to a more robust state (reconnect powerlines, # storage state of charge close to the target state of charge, @@ -1109,11 +1125,10 @@ def act(self, # TODO optimization to chose the "best" line to reconnect act.line_set_status = [(l_id, +1)] - # TODO self.update_parameters(obs, unsafe=False) curtailment, storage, redispatching = self.compute_optimum_safe(obs, l_id) # get back the grid2op representation - act = self.to_grid2op(obs, curtailment, storage, redispatching, act) + act = self.to_grid2op(obs, curtailment, storage, redispatching, act, safe=True) else: # I do nothing between rho_danger and rho_safe self.logger.info(f"step {obs.current_step}, do nothing mode") From 2aa350839d144dfb7c7e43d945edf6bf37443050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Serr=C3=A9?= <56162277+gaetanserre@users.noreply.github.com> Date: Thu, 26 May 2022 13:47:01 +0200 Subject: [PATCH 47/56] Update default value for observation and action normalization --- l2rpn_baselines/PPO_SB3/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index ef18f6d..eeb3dce 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -27,8 +27,8 @@ def evaluate(env, save_gif=False, gymenv_class=GymEnv, gymenv_kwargs=None, - obs_space_kwargs=None, # TODO - act_space_kwargs=None, # TODO + obs_space_kwargs={}, + act_space_kwargs={}, iter_num=None, **kwargs): """ From b24ec03caa6a649c43c7fb171ab578401f7d1093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Serr=C3=A9?= <56162277+gaetanserre@users.noreply.github.com> Date: Thu, 26 May 2022 14:05:19 +0200 Subject: [PATCH 48/56] Same default name for train and evaluate --- l2rpn_baselines/PPO_SB3/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index ef18f6d..6379ef9 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -18,7 +18,7 @@ def evaluate(env, load_path=".", - name="ppo_stable_baselines", + name="PPO_SB3", logs_path=None, nb_episode=1, nb_process=1, From c32abde5e498867629b8c4f05e45349aecedba4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Serr=C3=A9?= <56162277+gaetanserre@users.noreply.github.com> Date: Fri, 27 May 2022 13:12:06 +0200 Subject: [PATCH 49/56] Default value for obs_space_kwargs and act_space_kwargs --- l2rpn_baselines/PPO_SB3/evaluate.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/l2rpn_baselines/PPO_SB3/evaluate.py b/l2rpn_baselines/PPO_SB3/evaluate.py index eeb3dce..eff7b39 100644 --- a/l2rpn_baselines/PPO_SB3/evaluate.py +++ b/l2rpn_baselines/PPO_SB3/evaluate.py @@ -27,8 +27,8 @@ def evaluate(env, save_gif=False, gymenv_class=GymEnv, gymenv_kwargs=None, - obs_space_kwargs={}, 
- act_space_kwargs={}, + obs_space_kwargs=None, + act_space_kwargs=None, iter_num=None, **kwargs): """ @@ -149,6 +149,11 @@ def evaluate(env, env.close() """ + + if obs_space_kwargs is None: + obs_space_kwargs = {} + if act_space_kwargs is None: + act_space_kwargs = {} # load the attributes kept my_path = os.path.join(load_path, name) From f98a523719ace03da352362184f183b77b2ce67f Mon Sep 17 00:00:00 2001 From: DONNOT Benjamin Date: Wed, 1 Jun 2022 14:27:30 +0200 Subject: [PATCH 50/56] improving setup and docs --- docs/index.rst | 8 ++++---- setup.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 3d820f4..6e715c3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,7 +28,7 @@ learning to the power grid control problem. .. toctree:: :maxdepth: 1 - :caption: Open source libraries + :caption: Some RL implementation examples ppo_rllib ppo_stable_baselines @@ -47,13 +47,13 @@ in some environment. .. toctree:: :maxdepth: 1 - :caption: Open source libraries + :caption: Expert systems and optimizers expertagent optimcvxpy -Possible implementation +Legacy implementations --------------------------- .. note:: @@ -74,7 +74,7 @@ using grid2op more closely that through the gym interface. .. toctree:: :maxdepth: 1 - :caption: Reference baselines + :caption: Legacy implementations utils deepqsimple diff --git a/setup.py b/setup.py index fd53577..4f7ad6f 100644 --- a/setup.py +++ b/setup.py @@ -53,10 +53,10 @@ long_description_content_type="text/markdown", classifiers=[ 'Development Status :: 4 - Beta', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Intended Audience :: Developers", "Intended Audience :: Education", @@ -66,7 +66,7 @@ keywords='ML powergrid optmization RL power-systems', author='Benjamin DONNOT', author_email='benjamin.donnot@rte-france.com', - url="https://github.com/BDonnot/L2RPN_Baselines", + url="https://github.com/rte-france/L2RPN_Baselines", license='MPL', packages=setuptools.find_packages(), include_package_data=True, From 914dd04378de7537db36fb837805d6ec0cd07988 Mon Sep 17 00:00:00 2001 From: BOGUSLAWSKI Eva Ext Date: Thu, 2 Jun 2022 11:42:07 +0200 Subject: [PATCH 51/56] fix bug with self._th_lim_mw.value --- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index 97bac6c..6baaaee 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -525,9 +525,13 @@ def _update_topo_param(self, obs: BaseObservation): self.bus_storage.value[:] = tmp_ def _update_th_lim_param(self, obs: BaseObservation): + threshold_ = 1. # take into account reactive value (and current voltage) in thermal limit self._th_lim_mw.value[:] = (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 - self._th_lim_mw.value[:] = np.sqrt(self._th_lim_mw.value) + # if (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 is too small, I put 1 + mask_ok = self._th_lim_mw.value >= threshold_ + self._th_lim_mw.value[mask_ok] = np.sqrt(self._th_lim_mw.value[mask_ok]) + self._th_lim_mw.value[~mask_ok] = threshold_ # do whatever you can for disconnected lines index_disc = obs.v_or == 0. 
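For reference, the `_update_th_lim_param` change just above amounts to the computation sketched below: the ampere thermal limit and the origin-side voltage give an apparent-power limit, the reactive flow is removed from it, and the square root is only taken where the result stays above a small threshold. This is an illustrative, standalone restatement only; the helper name `thermal_limit_mw` and the sample numbers are made up here and are not part of the library.

```python
import numpy as np

def thermal_limit_mw(thermal_limit_a, v_or_kv, q_or_mvar, threshold=1.0):
    # squared active-power margin: (sqrt(3) * V[kV] * I[kA])^2 - Q[MVAr]^2,
    # with the current limit converted from A to kA (hence the 0.001 factor)
    p_square = (0.001 * thermal_limit_a) ** 2 * v_or_kv ** 2 * 3.0 - q_or_mvar ** 2
    # values that are "too small" (possibly negative) are clipped to the threshold,
    # exactly to avoid taking the square root of a negative number
    res = np.full_like(p_square, threshold, dtype=float)
    ok = p_square >= threshold
    res[ok] = np.sqrt(p_square[ok])
    return res

# made-up example: a 1000 A limit at 138 kV with 30 MVAr flowing -> about 237 MW
print(thermal_limit_mw(np.array([1000.]), np.array([138.]), np.array([30.])))
```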
From f330697d94a84cef4f21c953b6441d63777127d0 Mon Sep 17 00:00:00 2001 From: BOGUSLAWSKI Eva Ext Date: Thu, 2 Jun 2022 11:45:58 +0200 Subject: [PATCH 52/56] remove TODO comment in optimCVXPY bug --- l2rpn_baselines/OptimCVXPY/optimCVXPY.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py index 6baaaee..677b65b 100644 --- a/l2rpn_baselines/OptimCVXPY/optimCVXPY.py +++ b/l2rpn_baselines/OptimCVXPY/optimCVXPY.py @@ -537,8 +537,6 @@ def _update_th_lim_param(self, obs: BaseObservation): index_disc = obs.v_or == 0. self._th_lim_mw.value[index_disc] = 0.001 * (obs.thermal_limit * self._v_ref )[index_disc] * np.sqrt(3.) - # TODO what if (0.001 * obs.thermal_limit)**2 * obs.v_or **2 * 3. - obs.q_or**2 is negative ! - def _update_storage_power_obs(self, obs: BaseObservation): self._storage_power_obs.value += obs.storage_power.sum() From c9809e2645e84ec6946f9c46ee1776d78db5eaa2 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 7 Jun 2022 16:56:18 +0200 Subject: [PATCH 53/56] ready for pr for version 0.6 --- examples/ppo_stable_baselines/A_prep_env.py | 10 +-- .../ppo_stable_baselines/B_train_agent.py | 26 ++++---- .../C_evaluate_trained_model.py | 33 ++++++---- examples/ppo_stable_baselines/ReadMe.md | 61 ++++++++++++++++++- l2rpn_baselines/Kaist | 2 +- 5 files changed, 99 insertions(+), 33 deletions(-) diff --git a/examples/ppo_stable_baselines/A_prep_env.py b/examples/ppo_stable_baselines/A_prep_env.py index 3297730..6a05549 100644 --- a/examples/ppo_stable_baselines/A_prep_env.py +++ b/examples/ppo_stable_baselines/A_prep_env.py @@ -14,19 +14,19 @@ import grid2op from grid2op.dtypes import dt_int from grid2op.Agent import RecoPowerlineAgent -from grid2op.utils import ScoreL2RPN2020, ScoreICAPS2021, EpisodeStatistics +from grid2op.utils import EpisodeStatistics, ScoreL2RPN2022, ScoreICAPS2021 from lightsim2grid import LightSimBackend import numpy as np is_windows = sys.platform.startswith("win32") -env_name = "l2rpn_icaps_2021_small" -env_name = "l2rpn_wcci_2022_dev" -env_name = "wcci_2022_dev" -SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 +env_name = "l2rpn_wcci_2022" +SCOREUSED = ScoreL2RPN2022 # ScoreICAPS2021 name_stats = "_reco_powerline" nb_process_stats = 4 if not is_windows else 1 +# if you still want to use multi processing on windows +# have a look at the `env.generate_classe()` function verbose = 1 deep_copy = is_windows # force the deep copy on windows (due to permission issue in symlink in windows) diff --git a/examples/ppo_stable_baselines/B_train_agent.py b/examples/ppo_stable_baselines/B_train_agent.py index 4942def..68b1686 100644 --- a/examples/ppo_stable_baselines/B_train_agent.py +++ b/examples/ppo_stable_baselines/B_train_agent.py @@ -20,14 +20,9 @@ from grid2op.Action import PlayableAction from l2rpn_baselines.utils import GymEnvWithReco, GymEnvWithRecoWithDN -env_name = "l2rpn_icaps_2021_small_train" -env_name = "l2rpn_wcci_2022_dev_train" -env_name = "wcci_2022_dev_2" -env_name = "wcci_2022_dev_train" -# env_name = "l2rpn_case14_sandbox" +env_name = "l2rpn_wcci_2022_train" save_path = "./saved_model" -name = "test_normalize_features" -name = "test_1" +name = "FirstAgent" gymenv_class = GymEnvWithRecoWithDN # uses the heuristic to do nothing is the grid is not at risk and to reconnect powerline automatically max_iter = 7 * 24 * 12 # None to deactivate it safe_max_rho = 0.9 # the grid is said "safe" if the rho is lower than this value, it is a really important parameter to tune ! 
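The training script above delegates the "easy" decisions to `GymEnvWithRecoWithDN`, driven by the `safe_max_rho` threshold. The sketch below only illustrates the kind of rule the comments describe (reconnect what can be reconnected, do nothing while `obs.rho.max()` stays below `safe_max_rho`, and only query the trained policy otherwise); it is not the actual implementation from `l2rpn_baselines.utils`, and the helper name is invented.

```python
import numpy as np

def heuristic_actions(obs, action_space, safe_max_rho=0.9):
    """Actions to play without querying the agent; an empty list means "ask the policy"."""
    res = []
    # reconnect any disconnected powerline that is no longer in cooldown
    can_reco = ~obs.line_status & (obs.time_before_cooldown_line == 0)
    for l_id in np.nonzero(can_reco)[0]:
        res.append(action_space({"set_line_status": [(int(l_id), +1)]}))
    # if nothing needs reconnection and the grid is "safe", just do nothing
    if not res and obs.rho.max() < safe_max_rho:
        res.append(action_space())
    return res
```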
@@ -133,7 +128,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): # same here you can change it as you please act_attr_to_keep = ["redispatch", "curtail", "set_storage"] # parameters for the learning - nb_iter = 300_000 + nb_iter = 30_000 learning_rate = 3e-4 net_arch = [200, 200, 200, 200] gamma = 0.999 @@ -149,6 +144,14 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): with open("preprocess_act.json", "r", encoding="utf-8") as f: act_space_kwargs = json.load(f) + # for this, you might want to have a look at: + # - https://grid2op.readthedocs.io/en/latest/parameters.html#grid2op.Parameters.Parameters.LIMIT_INFEASIBLE_CURTAILMENT_STORAGE_ACTION + # - https://grid2op.readthedocs.io/en/latest/action.html#grid2op.Action.BaseAction.limit_curtail_storage + # This really helps the training, but you cannot change + # this parameter when you evaluate your agent, so you need to rely + # on act.limit_curtail_storage(...) before you give your action to the + # environment + param = env.parameters param.LIMIT_INFEASIBLE_CURTAILMENT_STORAGE_ACTION = True env.change_parameters(param) @@ -156,12 +159,7 @@ def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): if max_iter is not None: env.set_max_iter(max_iter) # one week obs = env.reset() - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*february_000$", x) is not None) - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*00$", x) is not None) - # env.chronics_handler.real_data.set_filter(lambda x: True) - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*500$", x) is not None) - # env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-08-01_.*$", x) is not None) + # train on all february month, why not ? env.chronics_handler.real_data.set_filter(lambda x: re.match(r".*2050-02-.*$", x) is not None) env.chronics_handler.real_data.reset() # see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline diff --git a/examples/ppo_stable_baselines/C_evaluate_trained_model.py b/examples/ppo_stable_baselines/C_evaluate_trained_model.py index 4020d6d..4c6f54c 100644 --- a/examples/ppo_stable_baselines/C_evaluate_trained_model.py +++ b/examples/ppo_stable_baselines/C_evaluate_trained_model.py @@ -10,19 +10,21 @@ import numpy as np import grid2op -from grid2op.utils import ScoreICAPS2021, ScoreL2RPN2020 +from grid2op.utils import ScoreL2RPN2022 +from grid2op.Agent import RecoPowerlineAgent + from lightsim2grid import LightSimBackend -from grid2op.gym_compat import GymEnv from l2rpn_baselines.PPO_SB3 import evaluate from A_prep_env import _aux_get_env, get_env_seed, name_stats from B_train_agent import gymenv_class, name, safe_max_rho +# NB you can also chose to change the "safe_max_rho" parameter +# and use a different parameter for evaluation than the one used for +# training. 
-env_name = "l2rpn_icaps_2021_small_val" -env_name = "l2rpn_wcci_2022_dev_val" -env_name = "wcci_2022_dev_val" -SCOREUSED = ScoreL2RPN2020 # ScoreICAPS2021 +env_name = "l2rpn_wcci_2022_val" +SCOREUSED = ScoreL2RPN2022 agent_name = name nb_scenario = 10 @@ -98,18 +100,27 @@ def get_ts_survived_reco(env_name): gymenv_class=gymenv_class, obs_space_kwargs=obs_space_kwargs, act_space_kwargs=act_space_kwargs) - _, ts_survived, _ = my_score.get(my_agent) + scores_r, n_played_r, total_ts_r = my_score.get(RecoPowerlineAgent(env_val.action_space)) + scores, n_played, total_ts = my_score.get(my_agent) + + res_scores = {"scores": [float(score) for score in scores], + "n_played": [int(el) for el in n_played], + "total_ts": [int(el) for el in total_ts]} # compare with do nothing best_than_dn = 0 - for my_ts, dn_ts in zip(ts_survived, dn_ts_survived): - print(f"\t{':-)' if my_ts >= dn_ts else ':-('} I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})") + for score, my_ts, dn_ts in zip(scores, n_played, dn_ts_survived): + print(f"\t{':-)' if my_ts >= dn_ts else ':-('}:" + f"\n\t\t- I survived {my_ts} steps vs {dn_ts} for do nothing ({my_ts - dn_ts})" + f"\n\t\t- my score is {score:.2f} (do nothing is 0.)") best_than_dn += my_ts >= dn_ts print(f"The agent \"{agent_name}\" beats \"do nothing\" baseline in {best_than_dn} out of {len(dn_ts_survived)} episodes") # compare with reco powerline best_than_reco = 0 - for my_ts, reco_ts in zip(ts_survived, reco_ts_survived): - print(f"\t{':-)' if my_ts >= reco_ts else ':-('} I survived {my_ts} steps vs {reco_ts} for reco powerline ({my_ts - reco_ts})") + for score, my_ts, reco_ts, score_ in zip(scores, n_played, reco_ts_survived, scores_r): + print(f"\t{':-)' if my_ts >= reco_ts else ':-('}:" + f"\n\t\t- I survived {my_ts} steps vs {reco_ts} for reco powerline ({my_ts - reco_ts})" + f"\n\t\t- my score is {score:.2f} (reco powerline: {score_:.2f})") best_than_reco += my_ts >= reco_ts print(f"The agent \"{agent_name}\" beats \"reco powerline\" baseline in {best_than_reco} out of {len(reco_ts_survived)} episodes") diff --git a/examples/ppo_stable_baselines/ReadMe.md b/examples/ppo_stable_baselines/ReadMe.md index b2a1f1f..b329567 100644 --- a/examples/ppo_stable_baselines/ReadMe.md +++ b/examples/ppo_stable_baselines/ReadMe.md @@ -28,6 +28,9 @@ In this phase, we do 3 things: \* of course you can make a sparse reward from it. Your agent receive always 0.0 unless when "done = True" (so last step of the episode) where this score can be computed. This is not the approach we took here. +**This script might take a while to compute !** + + ## 2 Training the agent In this phase where the training takes place and is implemented in the script `B_train_agent.py` @@ -41,10 +44,64 @@ This also means that the number of steps performed by grid2op is higher than the What is of particular importance in this script, beside the usual "learning rate" and "neural network architecture" is the "`safe_max_rho`" meta parameters. This parameters controls when the agent is asked to perform an action (when any `obs.rho >= safe_max_rho`). If it's too high, then the agent will almost never act and might not learn anything. If it's too low then the "heuristic" part ("do nothing when the grid is safe") will not be used and the agent might take a lot of time to learn this. 
+**This script might take a while to compute!**
+
 ## 3 evaluate the agent
 
-TODO
+This is done with the script "`C_evaluate_trained_model.py`". It reports the score as if you had submitted your agent to the codalab platform (be aware that the real score depends on the chronics in the validation / test set as well as on the seed used, and can also vary depending on the versions of the different packages you installed, especially grid2op and lightsim2grid).
+
+Do not hesitate to run this script multiple times to make sure your agent is consistent (*eg* it would be rather optimistic to rely on an agent that performs really well on some runs and really poorly on others...).
+
+You might also refine some of the "parameters" of your agent here. For example, by
+default we use a `safe_max_rho` of 0.9, but you might want to change it to 0.8 or 0.95 to improve the performance of your agent.
 
 ## 4 preparing the submision
 
-TODO
\ No newline at end of file
+Before you submit your agent, you need to make sure that it is trained
+on the same environment as the one it will be tested on (unless you took
+particular care to have an agent able to operate different grids).
+
+You also need to make sure that this agent runs with the same package versions
+as the ones you used locally. For example, for the WCCI 2022 competition your agent
+is expected to run on grid2op 1.7.1 and lightsim2grid 0.7.0.
+
+Finally, you need to make sure that your agent does not use packages that are not available at test time. The list of available packages is usually found in the
+description of the competition (on this aspect: unless told otherwise, you are free to use any package you want at training time; only at test time are you required to provide an agent working with the installed packages).
+
+Once done, all that is required to submit your agent to the competition is that you
+provide a "`make_agent(env, path)`" function with:
+
+- `env` being a grid2op environment with the same properties as the one
+  that will be used to test your agent (but it's not the actual test environment)
+- `path` being the location where the code is executed; it is useful if you need
+  extra data to use your agent (in this case the weights of the neural networks
+  used in the policy or the normalizer for the observation, etc.)
+
+A possible implementation is:
+
+```python
+import os
+from l2rpn_baselines.PPO_SB3 import evaluate
+safe_max_rho = 0.9  # or the one you find most suited for your agent
+
+def make_agent(env, path):
+    agent, _ = evaluate(env,
+                        load_path=os.path.join(path, TheDirYouUsedToSaveTheAgent),
+                        name=TheNameOfYourAgent,
+                        nb_episode=0,
+                        gymenv_class=ThePossibleGymEnvClass,
+                        gymenv_kwargs={"safe_max_rho": safe_max_rho}  # only if you used the `GymEnvWithRecoWithDN` environment, otherwise any
+                        # other parameters you might need
+                        )
+    return agent
+
+# NB: by default:
+# - TheDirYouUsedToSaveTheAgent is "saved_model"
+# - TheNameOfYourAgent is "PPO_SB3"
+# - ThePossibleGymEnvClass is GymEnvWithRecoWithDN (that you need to
+#   import with `from l2rpn_baselines.utils import GymEnvWithRecoWithDN`)
+```
+
+(do not forget to include the `preprocess_act.json` and `preprocess_obs.json` files in the submission as well as the "saved_model" directory, if possible only containing
+the agent you want to test and not all your runs.)
+
+All you need to do then is to follow the instructions in the starting kit of the competition to zip all these files properly. 
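Before zipping the submission described above, it can be worth exercising the `make_agent` entry point locally. The snippet below is only a sketch: the module name `my_submission` and the number of steps are assumptions, and the actual competition runner performs its own, different, bookkeeping.

```python
import os
import grid2op
from lightsim2grid import LightSimBackend
from my_submission import make_agent  # the file of your submission defining make_agent(env, path)

env = grid2op.make("l2rpn_wcci_2022", backend=LightSimBackend())
agent = make_agent(env, os.path.abspath("."))  # path = directory containing the submission files

obs = env.reset()
reward, done = 0.0, False
for _ in range(10):  # a handful of steps is enough to catch loading / shape errors
    act = agent.act(obs, reward, done)
    obs, reward, done, info = env.step(act)
    if done:
        break
```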
diff --git a/l2rpn_baselines/Kaist b/l2rpn_baselines/Kaist index 71c49e7..b2b6561 160000 --- a/l2rpn_baselines/Kaist +++ b/l2rpn_baselines/Kaist @@ -1 +1 @@ -Subproject commit 71c49e73ace272fd6d8258a5295abc2b8d3bea1b +Subproject commit b2b6561a2cc3afbf03fd13ef6d1b334e4ec6c98a From a1debdabd4756bcb9a384deacb83a27f692d6373 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 7 Jun 2022 17:00:35 +0200 Subject: [PATCH 54/56] update changelog and package version --- CHANGELOG.rst | 7 ++++--- docs/conf.py | 4 ++-- l2rpn_baselines/__init__.py | 3 ++- setup.py | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 74b1b2b..090ce6f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,10 +13,9 @@ Change Log - show an example on how to use some "heuristic" in training / evaluation of trained agents - show an example of model based RL agent - train sowhere a working baseline (that does better than do nothing) -- refactor the `utils.DeepQAgent` to split the different part better: starting at different steps, not training for a certain number of steps, sampling hard scenarios etc. -- stack multiple states in `utils/DeepQAgent` +- show an example of a baseline that uses a GNN -[0.6.0] - 2022-xx-yy +[0.6.0] - 2022-06-07 -------------------- - [BREAKING] name of the file inside the submodule are now lowercase (PEP 8 compliance) Use `from l2rpn_baselines.[BASELINENAME] import [BASELINENAME]` by replacing @@ -25,7 +24,9 @@ Change Log - [FIXED] some bugs (especially in the type of actions) for some agents - [ADDED] a code example to use stable baselines 3 (see `l2rpn_baselines.PPO_SB3`) - [ADDED] a code example to use RLLIB (see `l2rpn_baselines.PPO_RLLIB`) +- [ADDED] an optimizer (see `l2rpn_baselines.OptimCVXPY`) - [ADDED] some issue templates +- [ADDED] some examples in the "examples" folder [0.5.1] - 2021-04-09 --------------------- diff --git a/docs/conf.py b/docs/conf.py index d2f52ef..407ab12 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,8 +22,8 @@ author = 'Benjamin DONNOT' # The full version, including alpha/beta/rc tags -release = '0.5.1' -version = '0.5' +release = '0.6.0' +version = '0.6' # -- General configuration --------------------------------------------------- diff --git a/l2rpn_baselines/__init__.py b/l2rpn_baselines/__init__.py index 735c0bf..a31d0ae 100644 --- a/l2rpn_baselines/__init__.py +++ b/l2rpn_baselines/__init__.py @@ -6,6 +6,8 @@ # SPDX-License-Identifier: MPL-2.0 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+__version__ = "0.6.0" + all_baselines_li = [ "Template", "DoNothing", @@ -29,4 +31,3 @@ # utilitary scripts "utils" ] -__version__ = "0.5.1" diff --git a/setup.py b/setup.py index 4f7ad6f..145f99c 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ import setuptools from setuptools import setup -__version__ = "0.5.1" +__version__ = "0.6.0" pkgs = { From 958495380253b03b31de6f97c258a874859fcf7e Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 7 Jun 2022 17:03:15 +0200 Subject: [PATCH 55/56] remove useless code in example --- examples/optim_cvxpy/l2rpn_wcci_2022.py | 32 ------------------------- 1 file changed, 32 deletions(-) diff --git a/examples/optim_cvxpy/l2rpn_wcci_2022.py b/examples/optim_cvxpy/l2rpn_wcci_2022.py index 9ffd3c6..4a2e884 100644 --- a/examples/optim_cvxpy/l2rpn_wcci_2022.py +++ b/examples/optim_cvxpy/l2rpn_wcci_2022.py @@ -22,24 +22,8 @@ test=is_test, backend=LightSimBackend() ) - - -# logger: logging.Logger = logging.getLogger(__name__) -# logger.disabled = False -# logger.addHandler(logging.StreamHandler(sys.stdout)) -# logger.setLevel(level=logging.DEBUG) logger = None - -# scenario : 349 steps -# agent = OptimCVXPY(env.action_space, -# env, -# penalty_redispatching_unsafe=0., -# penalty_storage_unsafe=0.01, -# penalty_curtailment_unsafe=0.01, -# logger=logger -# ) - agent = OptimCVXPY(env.action_space, env, penalty_redispatching_unsafe=0., @@ -63,10 +47,6 @@ "2050-11-14_31", "2050-12-19_31", ] -# scen_test = ["2050-02-21_31", -# "2050-09-26_31" -# ] -# scen_test = ["2050-01-03_31"] print("For do nothing: ") dn_act = env.action_space() @@ -92,20 +72,8 @@ for nb_step in tqdm(range(obs.max_step)): prev_obs = obs prev_act = act - # agent._DEBUG = nb_step >= 1840 - # agent._DEBUG = nb_step >= 949 - # agent._DEBUG = nb_step >= 705 - # agent._DEBUG = nb_step >= 154 - # agent._DEBUG = nb_step >= 82 act = agent.act(obs) obs, reward, done, info = env.step(act) - # print(f"{obs.target_dispatch.sum():.2f}, {obs.storage_power.sum():.2f}, {obs.curtailment_mw.sum():.2f}, {obs.curtailment_limit[12]:.2f}") - # print([f"{el:.2f}" for el in obs.curtailment_limit[[12, 14, 15, 21, 24]]]) - # gen_id = 12 - # print(f"limit: {obs.curtailment_limit[gen_id]:.2f}, " - # f"actual gen: {obs.gen_p[gen_id] / obs.gen_pmax[gen_id] :.2f}, " - # f"possible gen: {obs.gen_p_before_curtail[gen_id] / obs.gen_pmax[gen_id] :.2f}") if done and (nb_step != prev_obs.max_step - 1): - # pdb.set_trace() break print(f"\t scenario: {os.path.split(env.chronics_handler.get_id())[-1]}: {nb_step + 1} / {obs.max_step}") From f345dad7b46820022d946f43ea286042ec16ae43 Mon Sep 17 00:00:00 2001 From: BDonnot Date: Tue, 7 Jun 2022 17:18:34 +0200 Subject: [PATCH 56/56] adding possibility to post process a grid2op action before it's used by the grid2op environment --- examples/optim_cvxpy/l2rpn_wcci_2022.py | 2 +- l2rpn_baselines/OptimCVXPY/make_agent.py | 44 +++++++++++++++++++----- l2rpn_baselines/utils/gymAgent.py | 2 ++ l2rpn_baselines/utils/gymenv_custom.py | 20 ++++++++++- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/examples/optim_cvxpy/l2rpn_wcci_2022.py b/examples/optim_cvxpy/l2rpn_wcci_2022.py index 4a2e884..ecb99fe 100644 --- a/examples/optim_cvxpy/l2rpn_wcci_2022.py +++ b/examples/optim_cvxpy/l2rpn_wcci_2022.py @@ -15,7 +15,7 @@ from tqdm import tqdm import pdb -env_name = "wcci_2022_dev" # name subject to change +env_name = "l2rpn_wcci_2022" is_test = False env = grid2op.make(env_name, diff --git a/l2rpn_baselines/OptimCVXPY/make_agent.py b/l2rpn_baselines/OptimCVXPY/make_agent.py index 
5748526..2a91741 100644 --- a/l2rpn_baselines/OptimCVXPY/make_agent.py +++ b/l2rpn_baselines/OptimCVXPY/make_agent.py @@ -10,7 +10,25 @@ from l2rpn_baselines.OptimCVXPY.optimCVXPY import OptimCVXPY -def make_agent(env: Environment, dir_path: os.PathLike) -> OptimCVXPY: +def make_agent(env: Environment, + dir_path: os.PathLike, + lines_x_pu=None, + margin_th_limit: float=0.9, + alpha_por_error: float=0.5, + rho_danger: float=0.95, + rho_safe: float=0.85, + penalty_curtailment_unsafe: float=0.1, + penalty_redispatching_unsafe: float=0.03, + penalty_storage_unsafe: float=0.3, + penalty_curtailment_safe: float=0.0, + penalty_redispatching_safe: float=0.0, + weight_redisp_target: float=1.0, + weight_storage_target: float=1.0, + weight_curtail_target: float=1.0, + penalty_storage_safe: float=0.0, + margin_rounding: float=0.01, + margin_sparse: float=5e-3, + ) -> OptimCVXPY: """First example of the function you will need to provide to send your agent to l2rpn competitions or to use your agent in grid2game. @@ -30,13 +48,21 @@ def make_agent(env: Environment, dir_path: os.PathLike) -> OptimCVXPY: # TODO read the parameters from a config file ! agent = OptimCVXPY(env.action_space, env, - penalty_redispatching_unsafe=0., - penalty_storage_unsafe=0.1, - penalty_curtailment_unsafe=0.01, - rho_safe=0.85, - rho_danger=0.9, - margin_th_limit=0.93, - alpha_por_error=0.5, - weight_redisp_target=0.,) + lines_x_pu=lines_x_pu, + margin_th_limit=margin_th_limit, + alpha_por_error=alpha_por_error, + rho_danger=rho_danger, + rho_safe=rho_safe, + penalty_curtailment_unsafe=penalty_curtailment_unsafe, + penalty_redispatching_unsafe=penalty_redispatching_unsafe, + penalty_storage_unsafe=penalty_storage_unsafe, + penalty_curtailment_safe=penalty_curtailment_safe, + penalty_redispatching_safe=penalty_redispatching_safe, + weight_redisp_target=weight_redisp_target, + weight_storage_target=weight_storage_target, + weight_curtail_target=weight_curtail_target, + penalty_storage_safe=penalty_storage_safe, + margin_rounding=margin_rounding, + margin_sparse=margin_sparse) return agent diff --git a/l2rpn_baselines/utils/gymAgent.py b/l2rpn_baselines/utils/gymAgent.py index 2239626..809f8a7 100644 --- a/l2rpn_baselines/utils/gymAgent.py +++ b/l2rpn_baselines/utils/gymAgent.py @@ -196,5 +196,7 @@ def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAc gym_obs = self._gym_obs_space.to_gym(observation) gym_act = self.get_act(gym_obs, reward, done) grid2op_act = self._gym_act_space.from_gym(gym_act) + # fix the action if needed (for example by limiting curtailment and storage) + grid2op_act = self.gymenv.fix_action(grid2op_act) return grid2op_act diff --git a/l2rpn_baselines/utils/gymenv_custom.py b/l2rpn_baselines/utils/gymenv_custom.py index bd4f5f6..cfdb9ae 100644 --- a/l2rpn_baselines/utils/gymenv_custom.py +++ b/l2rpn_baselines/utils/gymenv_custom.py @@ -182,6 +182,23 @@ def apply_heuristics_actions(self, break return g2op_obs, res_reward, done, info + def fix_action(self, grid2op_action): + """This function can be used to "fix" / "modify" / "cut" / "change" + a grid2op action just before it will be applied to the underlying "env.step(...)" + + This can be used, for example to "limit the curtailment or storage" of the + action in case this one is too strong and would lead to a game over. + + By default it does nothing. 
+
+        Parameters
+        ----------
+        grid2op_action : grid2op.Action.BaseAction
+            The grid2op action, as converted from the gym action, before it is given
+            to the underlying grid2op environment.
+
+        Returns
+        -------
+        grid2op.Action.BaseAction
+            The (possibly modified) action that will actually be given to the
+            underlying grid2op environment.
+
+        """
+        return grid2op_action
+
     def step(self, gym_action):
         """This function implements the special case of the "step" function (as seen by the "gym environment") that might call multiple times the "step" function of the underlying "grid2op environment" depending on the
@@ -216,7 +233,8 @@ def step(self, gym_action):
             Other type of informations
         """
-        g2op_act = self.action_space.from_gym(gym_action)
+        g2op_act_tmp = self.action_space.from_gym(gym_action)
+        g2op_act = self.fix_action(g2op_act_tmp)
         g2op_obs, reward, done, info = self.init_env.step(g2op_act)
         if not done:
             g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info)
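The last patch above introduces the `fix_action` hook, called both from `GymAgent.act` and from `step`, precisely so that an action can be post-processed before it reaches the grid2op environment. One possible use, hinted at by the comments in `B_train_agent.py`, is sketched below; the subclass name is made up, and the exact signature and in-place behaviour of `limit_curtail_storage` should be checked against the installed grid2op version (>= 1.7.1).

```python
from l2rpn_baselines.utils import GymEnvWithRecoWithDN

class GymEnvLimitedAction(GymEnvWithRecoWithDN):
    """Hypothetical subclass: caps curtailment / storage set-points before env.step."""
    def fix_action(self, grid2op_action):
        obs = self.init_env.get_obs()  # current state of the underlying grid2op environment
        # shrink infeasible curtailment / storage orders instead of letting them cause a game over
        grid2op_action.limit_curtail_storage(obs, margin=10.)
        return grid2op_action
```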