diff --git a/.gitignore b/.gitignore
index 579d04d..89d4206 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,11 @@ l2rpn_baselines/DeepQSimple/saved_baseline/
 l2rpn_baselines/DuelQLeapNet/logs-eval/
 l2rpn_baselines/DuelQSimple/saved_baseline/
 l2rpn_baselines/SAC/saved_baseline/
-
+l2rpn_baselines/TestLeapNet/model_saved/
+l2rpn_baselines/TestLeapNet/tf_logs/
+l2rpn_baselines/TestLeapNet/logs-eval/
+l2rpn_baselines/LeapNetEncoded/logs-eval/
+l2rpn_baselines/LeapNetEncoded/model_saved/
+l2rpn_baselines/LeapNetEncoded/tf_logs/
+l2rpn_baselines/LeapNetEncoded/tf_logs_test/
+l2rpn_baselines/LeapNetEncoded/model_test/
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 33234a2..32cf9de 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,18 @@ Change Log
 --------
 - stack multiple states in `utils/DeepQAgent`
 
+[0.5.0] - 2020-08-??
+--------------------
+- [FIXED] the counting of the action type frequencies in tensorboard (for some baselines)
+- [FIXED] a broken replay buffer `utils.ReplayBuffer` (used in some baselines)
+- [FIXED] a bug when using multiple environments for some baselines
+- [FIXED] a wrong Q-value update for some baselines
+- [IMPROVED] descriptions and computation of the tensorboard information (for some baselines)
+- [IMPROVED] performance optimization for training and usage of some baselines
+- [ADDED] better serialization as json of the `utils.NNParam` class
+- [ADDED] the LeapNetEncoded baseline, which uses a leap neural network (leap net) to create an
+  embedding of the state of the powergrid.
+
 [0.4.4] - 2020-07-07
 --------------------
 - [FIXED] now the baselines can fully support the grid2op MultiMix environment.
diff --git a/docs/LeapNetEncoded.rst b/docs/LeapNetEncoded.rst
new file mode 100644
index 0000000..92a3674
--- /dev/null
+++ b/docs/LeapNetEncoded.rst
@@ -0,0 +1,50 @@
+LeapNetEncoded: D3QN on a state encoded by a leap net
+======================================================
+
+TODO reference the original papers `ESANN Paper `_
+`Leap Net `_
+
+It has now been implemented as a github repository `Leap Net Github `_
+
+Description
+-----------
+The leap net is a type of neural network that has shown really good performance for predicting flows on
+powerlines based on the injections and the topology.
+
+In this baseline, we use this very same architecture to encode the powergrid state (at a given
+step).
+
+This embedding of the powergrid is then used by a neural network (that can be a regular network or
+a leap net) that parametrizes the Q function.
+
+An example to train this model is available in the train function :ref:`Example-leapnetenc`.
+
+Exported class
+--------------
+You can use this class with:
+
+.. code-block:: python
+
+    from l2rpn_baselines.LeapNetEncoded import train, evaluate, LeapNetEncoded
+
+.. automodule:: l2rpn_baselines.LeapNetEncoded
+    :members:
+    :autosummary:
+
+Other non-exported classes
+--------------------------
+These classes are not exported by default; you can import them with (non exhaustive list):
+
+.. code-block:: python
+
+    from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN
+    from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam
+
+
+.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN.LeapNetEncoded_NN
+    :members:
+    :autosummary:
+
+.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam
+    :members:
+    :autosummary:
diff --git a/docs/SAC.rst b/docs/SAC.rst
index 772368e..688dc0f 100644
--- a/docs/SAC.rst
+++ b/docs/SAC.rst
@@ -4,6 +4,9 @@ SAC: Soft Actor Critic
 This baseline comes from the paper:
 `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
 
+**NB** This version is a new implementation of the SAC baseline. We recommend using it for new
+projects. The old version had some issues; for backward compatibility it is still
+available under the name "SACOld".
 
 Description
 -----------
diff --git a/docs/SACOld.rst b/docs/SACOld.rst
new file mode 100644
index 0000000..3cf6237
--- /dev/null
+++ b/docs/SACOld.rst
@@ -0,0 +1,45 @@
+SACOld: Soft Actor Critic (deprecated)
+=======================================
+
+This baseline comes from the paper:
+`Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
+
+
+Description
+-----------
+This module proposes an implementation of the SAC algorithm.
+
+**This is an old implementation that is probably not correct; it is kept only for backward
+compatibility with earlier versions (< 0.5.0) of this package.**
+
+An example to train this model is available in the train function :ref:`Example-sacold`.
+
+Exported class
+--------------
+You can use this class with:
+
+.. code-block:: python
+
+    from l2rpn_baselines.SACOld import train, evaluate, SACOld
+
+.. automodule:: l2rpn_baselines.SACOld
+    :members:
+    :autosummary:
+
+Other non-exported classes
+--------------------------
+These classes are not exported by default; you can import them with (non exhaustive list):
+
+.. code-block:: python
+
+    from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+    from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+
+
+.. autoclass:: l2rpn_baselines.SACOld.SACOld_NN.SACOld_NN
+    :members:
+    :autosummary:
+
+.. autoclass:: l2rpn_baselines.SACOld.SACOld_NNParam.SACOld_NNParam
+    :members:
+    :autosummary:
diff --git a/docs/index.rst b/docs/index.rst
index 6492796..cc7641c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,16 @@ More advanced baselines
     DuelQLeapNet
     DoubleDuelingRDQN
+    LeapNetEncoded
+
+
+Deprecated baselines
+---------------------------
+
+..
toctree:: + :maxdepth: 2 + + SACOld Contributions diff --git a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py index 96c91c8..caf77d8 100644 --- a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py +++ b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py @@ -165,13 +165,13 @@ def _make_x_tau(self, data): res = [data_x, *data_tau] return res - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """Predict movement of game controler where is epsilon probability randomly move.""" if batch_size is None: batch_size = data.shape[0] data_split = self._make_x_tau(data) - res = super().predict_movement(data_split, epsilon=epsilon, batch_size=batch_size) + res = super().predict_movement(data_split, epsilon=epsilon, batch_size=batch_size, training=training) return res def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py new file mode 100644 index 0000000..6c43364 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + + +from l2rpn_baselines.utils import DeepQAgent + +DEFAULT_NAME = "LeapNetEncoded" + + +class LeapNetEncoded(DeepQAgent): + """ + Inheriting from :class:`l2rpn_baselines.DeepQAgent` this class implements the particular agent used for the + Double Duelling Deep Q network baseline, with the particularity that the Q network is encoded with a leap net. + + It does nothing in particular. + """ + pass diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py new file mode 100644 index 0000000..e30a0bc --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py @@ -0,0 +1,352 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import numpy as np +import os + +# tf2.0 friendly +import warnings + +import tensorflow as tf +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation + from tensorflow.keras.layers import Input, Lambda, subtract, add + import tensorflow.keras.backend as K + +from l2rpn_baselines.utils import BaseDeepQ, TrainingParam + + +# TODO implement that in the leap net package too +from tensorflow.keras.layers import Dense + + +from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import LtauBis + + +class LeapNetEncoded_NN(BaseDeepQ): + """ + Constructs the desired neural networks. 
+ + More information on the leap net can be found at `Leap Net on Github `_ + + These are: + + - a "state encoder" that uses a leap net to "encode" the observation, or at least the part + related to powergrid + - a q network, that uses the output of the state encoder to predict which action is best. + + The Q network can have other types of input, and can also be a leap net, see the class + :class:`l2rpn_baselines.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam` for more information + + """ + def __init__(self, + nn_params, + training_param=None): + if training_param is None: + training_param = TrainingParam() + BaseDeepQ.__init__(self, + nn_params, + training_param) + self._custom_objects = {"LtauBis": LtauBis} + self._max_global_norm_grad = training_param.max_global_norm_grad + self._max_value_grad = training_param.max_value_grad + self._max_loss = training_param.max_loss + + self.train_lr = 1.0 + + # added + self.encoded_state = None + self.grid_model = None + self._schedule_grid_model = None + self._optimizer_grid_model = None + self._qnet_variables = [] + self.grid_model_losses_npy = None + + self.construct_q_network() + + def construct_q_network(self): + """ + First the :attr:`l2rpn_baselines.BaseDeepQ.nn_archi` parameters are used to create a neural network + to 'encode' the data. Then the leaps occur. + + Afterward the model is split into value an advantage, and treated as usually in any D3QN. + + """ + # Uses the network architecture found in DeepMind paper + # The inputs and outputs size have changed, as well as replacing the convolution by dense layers. + self._model = Sequential() + inputs_x = [Input(shape=(el,), name="x_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.x_dims, self._nn_archi.list_attr_obs_x)] + inputs_q = [Input(shape=(el,), name="input_q_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.input_q_dims, self._nn_archi.list_attr_obs_input_q)] + inputs_tau = [Input(shape=(el,), name="tau_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.tau_dims, self._nn_archi.list_attr_obs_tau)] + input_topo = Input(shape=(2*self._nn_archi.dim_topo,), name="topo") + models_all_inputs = [*inputs_x, *inputs_q, *inputs_tau, input_topo] + + # encode each data type in initial layers + encs_out = [] + for init_val, nm_ in zip(inputs_x, self._nn_archi.list_attr_obs_x): + lay = init_val + for i, size in enumerate(self._nn_archi.sizes_enc): + lay = Dense(size, name="enc_{}_{}".format(nm_, i))(lay) # TODO resnet instead of Dense + lay = Activation("relu")(lay) + encs_out.append(lay) + + # concatenate all that + lay = tf.keras.layers.concatenate(encs_out) + # now "lay" is the encoded observation + + # i do a few layer + for i, size in enumerate(self._nn_archi.sizes_main): + lay = Dense(size, name="main_{}".format(i))(lay) # TODO resnet instead of Dense + lay = Activation("relu")(lay) + + # now i do the leap net to encode the state + encoded_state = tf.keras.layers.add([lay, LtauBis(name="leap_topo")([lay, input_topo])], + name="encoded_state") + self.encoded_state = tf.keras.backend.stop_gradient(encoded_state) + + # i predict the full state of the grid given the "control" variables + outputs_gm = [] + grid_model_losses = {} + lossWeights = {} # TODO + for sz_out, nm_ in zip(self._nn_archi.gm_out_dims, + self._nn_archi.list_attr_obs_gm_out): + lay = encoded_state # carefull i need my gradients here ! 
(don't use self.encoded_state) + for i, size in enumerate(self._nn_archi.sizes_out_gm): + lay = Dense(size, name="{}_{}".format(nm_, i))(lay) + lay = Activation("relu")(lay) + + # predict now the variable + name_output = "{}_hat".format(nm_) + pred_ = Dense(sz_out, name=name_output)(lay) + outputs_gm.append(pred_) + grid_model_losses[name_output] = "mse" + + # NB grid_model does not use inputs_tau + self.grid_model = Model(inputs=models_all_inputs, outputs=outputs_gm, name="grid_model") + self._schedule_grid_model, self._optimizer_grid_model = self.make_optimiser() + self.grid_model.compile(loss=grid_model_losses, optimizer=self._optimizer_grid_model) # , loss_weights=lossWeights + + # And now let's predict the Q values of each actions given the encoded grid state + input_Qnet = inputs_q + [self.encoded_state] + # TODO do i pre process the data coming from inputs_q ??? + + lay = tf.keras.layers.concatenate(input_Qnet, name="input_Q_network") + for i, size in enumerate(self._nn_archi.sizes_Qnet): + tmp = Dense(size, name="qvalue_{}".format(i)) # TODO resnet instead of Dense + lay = tmp(lay) + lay = Activation("relu")(lay) + self._qnet_variables += tmp.trainable_weights + + # And i predict the Q value of the action + l_tau = lay + for el, nm_ in zip(inputs_tau, self._nn_archi.list_attr_obs_tau): + tmp = LtauBis(name="leap_{}".format(nm_)) + l_tau = l_tau + tmp([lay, el]) + self._qnet_variables += tmp.trainable_weights + + tmp = Dense(self._action_size) + advantage = tmp(l_tau) + self._qnet_variables += tmp.trainable_weights + tmp = Dense(1, name="value") + value = tmp(l_tau) + self._qnet_variables += tmp.trainable_weights + + meaner = Lambda(lambda x: K.mean(x, axis=1)) + mn_ = meaner(advantage) + tmp = subtract([advantage, mn_]) + policy = add([tmp, value], name="policy") + + model_all_outputs = [policy] + self._model = Model(inputs=models_all_inputs, outputs=model_all_outputs) + self._schedule_model, self._optimizer_model = self.make_optimiser() + self._model.compile(loss='mse', optimizer=self._optimizer_model) + + self._target_model = Model(inputs=models_all_inputs, outputs=model_all_outputs) + + def _make_x_tau(self, data): + # for the x's + data_x = [] + prev = 0 + for sz, add_, mul_ in zip(self._nn_archi.x_dims, + self._nn_archi.x_adds, + self._nn_archi.x_mults): + tmp = (data[:, prev:(prev+sz)] + add_) * mul_ + data_x.append(tmp) + prev += sz + + # for the input of the q network + data_q = [] + for sz, add_, mul_ in zip(self._nn_archi.input_q_dims, + self._nn_archi.input_q_adds, + self._nn_archi.input_q_mults): + data_q.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + # for the taus + data_tau = [] + for sz, add_, mul_ in zip(self._nn_archi.tau_dims, + self._nn_archi.tau_adds, + self._nn_archi.tau_mults): + data_tau.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + # TODO pre process that into different vector + data_topo = self._process_topo(data[:, prev:(prev+self._nn_archi.dim_topo)]) + + prev += self._nn_archi.dim_topo + # TODO predict also gen_q and load_v here, and p_or and q_or and p_ex and q_ex + data_flow = [] + for sz, add_, mul_ in zip(self._nn_archi.gm_out_dims, + self._nn_archi.gm_out_adds, + self._nn_archi.gm_out_mults): + data_flow.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + res = [*data_x, *data_q, *data_tau, data_topo], data_flow + return res + + def _process_topo(self, topo_vect): + """process the topology vector. 
+ + As input grid2op encode it: + - -1 disconnected + - 1 connected to bus 1 + - 2 connected to bus 2 + + I transform it in a vector having twice as many component with the encoding, if we move + "by pairs": + - [0,0] -> disconnected + - [1,0] -> connected to bus 1 + - [0,1] -> connected to bus 2 + """ + res = np.zeros((topo_vect.shape[0], 2*topo_vect.shape[1]), + dtype=np.float32) + tmp_ = np.where(topo_vect == 1.) + res[tmp_[0], 2*tmp_[1]] = 1. + tmp_ = np.where(topo_vect == 2.) + res[tmp_[0], 2*tmp_[1]+1] = 1. + return res + + def predict_movement(self, data, epsilon, batch_size=None, training=False): + """Predict movement of game controller where is epsilon + probability randomly move.""" + if batch_size is None: + batch_size = data.shape[0] + data_nn, true_output_grid = self._make_x_tau(data) + res = super().predict_movement(data_nn, epsilon=epsilon, batch_size=batch_size, training=False) + return res + + def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): + if batch_size is None: + batch_size = s_batch.shape[0] + data_nn, true_output_grid = self._make_x_tau(s_batch) + data_nn2, true_output_grid2 = self._make_x_tau(s2_batch) + + # train the grid model to accurately predict the state of the grid + # TODO predict also gen_q and load_v here, and p_or and q_or and p_ex and q_ex + loss1 = self.grid_model.train_on_batch(data_nn, true_output_grid) + loss2 = self.grid_model.train_on_batch(data_nn2, true_output_grid2) + + # and now train the q network + res = super().train(data_nn, + a_batch, + r_batch, + d_batch, + data_nn2, + tf_writer=tf_writer, + batch_size=batch_size) + + self.grid_model_losses_npy = 0.5*(np.array(loss1) + np.array(loss2)) + return res + + def train_on_batch(self, model, optimizer_model, x, y_true): + """ + clip the loss + """ + with tf.GradientTape() as tape: + # Get y_pred for batch + y_pred = model(x) + # Compute loss for each sample in the batch + # and then clip it + batch_loss = self._clipped_batch_loss(y_true, y_pred) + # Compute mean scalar loss + loss = tf.math.reduce_mean(batch_loss) + loss_npy = loss.numpy() + + # Compute gradients + grads = tape.gradient(loss, self._qnet_variables) + + # clip gradients + if self._max_global_norm_grad is not None: + grads, _ = tf.clip_by_global_norm(grads, self._max_global_norm_grad) + if self._max_value_grad is not None: + grads = [tf.clip_by_value(grad, -self._max_value_grad, self._max_value_grad) + for grad in grads] + + # Apply gradients + optimizer_model.apply_gradients(zip(grads, self._qnet_variables)) + # Store LR + self.train_lr = optimizer_model._decayed_lr('float32').numpy() + + # Return loss scalar + return loss_npy + + def _clipped_batch_loss(self, y_true, y_pred): + sq_error = tf.math.square(y_true - y_pred, name="sq_error") + batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, name="batch_sq_error") + if self._max_loss is not None: + res = tf.clip_by_value(batch_sq_error, 0.0, self._max_loss, name="batch_sq_error_clip") + else: + res = batch_sq_error + return res + + def save_tensorboard(self, current_step): + if self.grid_model_losses_npy is not None: + for i, el in enumerate(self._nn_archi.list_attr_obs_gm_out): + tf.summary.scalar("loss_gridmodel_{}".format(el), + self.grid_model_losses_npy[i], + current_step, + description="Loss of the neural network representing the powergrid " + "for predicting {}" + "".format(el)) + + @staticmethod + def _get_path_model(path, name=None): + if name is None: + path_model = path + else: + path_model = os.path.join(path, name) + 
path_target_model = "{}_target".format(path_model) + path_grid_model = "{}_grid_model".format(path_model) + return path_model, path_target_model, path_grid_model + + def save_network(self, path, name=None, ext="h5"): + """ + Saves all the models with unique names + """ + path_model, path_target_model, path_grid_model = self._get_path_model(path, name) + self._model.save('{}.{}'.format(path_model, ext)) + self._target_model.save('{}.{}'.format(path_target_model, ext)) + self.grid_model.save('{}.{}'.format(path_grid_model, ext)) + + def load_network(self, path, name=None, ext="h5"): + """ + We load all the models using the keras "load_model" function. + """ + path_model, path_target_model, path_grid_model = self._get_path_model(path, name) + self.construct_q_network() + self._model.load_weights('{}.{}'.format(path_model, ext)) + self._target_model.load_weights('{}.{}'.format(path_target_model, ext)) + self.grid_model.load_weights('{}.{}'.format(path_grid_model, ext)) + if self.verbose: + print("Succesfully loaded network.") \ No newline at end of file diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py new file mode 100644 index 0000000..f669770 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py @@ -0,0 +1,177 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +import os +import numpy as np +import copy + +from l2rpn_baselines.utils import NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + + +class LeapNetEncoded_NNParam(NNParam): + """ + This class implements the type of parameters used by the DuelQLeapNet model. + + More information on the leap net can be found at `Leap Net on Github `_ + + Attributes + ----------- + list_attr_obs: + currently ot used + sizes: + currently not used + activs: + currently not used + x_dim: + currently not used + + list_attr_obs_x: + list of the attribute of the observation that serve as input of the grid model + (we recommend ["prod_p", "prod_v", "load_p", "load_q"]) + list_attr_obs_gm_out: + list of the attribute of the observation that serve as output for the grid model + (we recommend ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X) + though "rho" can be equally good an improve computation time + list_attr_obs_input_q: + list of the attribute of the observation that serve as input (other that the embedding of the + grid state) for the Q network (we recommend to have here anything "time related" for example + ["time_before_cooldown_line", "time_before_cooldown_sub", "actual_dispatch", + "target_dispatch", "day_of_week", "hour_of_day", "minute_of_hour"] etc. 
+ list_attr_obs_tau: + If you chose to encode your q network as a leap net it self, then you can put here the attribute + you would like the leap net to act on ( ["line_status", "timestep_overflow"] for example) + dim_topo: ``int`` + Dimension of the topology vector (init it with `env.dim_topo`) + + Examples + -------- + All other attributes need to be created once by a call to :func:`LeapNetEncoded_NNParam.compute_dims`: + + ..code-block:: python + + nn_archi.compute_dims(env) + nn_archi.center_reduce(env) + + These calls will set up all the attribute that are not set, and register this model to use + input data approximately in [-1,1] interval. + + + """ + _int_attr = copy.deepcopy(NNParam._int_attr) + _float_attr = copy.deepcopy(NNParam._float_attr) + _str_attr = copy.deepcopy(NNParam._str_attr) + _list_float = copy.deepcopy(NNParam._list_float) + _list_str = copy.deepcopy(NNParam._list_str) + _list_int = copy.deepcopy(NNParam._list_int) + + _int_attr += ["x_dim", "dim_topo"] + _list_str += ["list_attr_obs_tau", "list_attr_obs_x", "list_attr_obs_input_q", + "list_attr_obs_gm_out"] + _list_float += ["tau_adds", "tau_mults", "x_adds", "x_mults", + "input_q_adds", "input_q_mults", + "gm_out_adds", "gm_out_mults"] + _list_int += ["tau_dims", "x_dims", "gm_out_dims", "input_q_dims", + "sizes_enc", "sizes_main", "sizes_out_gm", "sizes_Qnet"] + nn_class = LeapNetEncoded_NN + + def __init__(self, + action_size, + observation_size, # not used here for retro compatibility with NNParam.from_dict + sizes, + activs, + x_dim, + + list_attr_obs, + list_attr_obs_tau, + list_attr_obs_x, + list_attr_obs_input_q, + list_attr_obs_gm_out, + + dim_topo, + + sizes_enc=(20, 20, 20), + sizes_main=(150, 150, 150), + sizes_out_gm=(100, 40), + sizes_Qnet=(100, 100, 100), + + input_q_adds=None, + input_q_mults=None, + gm_out_adds=None, + gm_out_mults=None, + tau_adds=None, + tau_mults=None, + x_adds=None, + x_mults=None, + + tau_dims=None, + x_dims=None, + gm_out_dims=None, + input_q_dims=None, + ): + NNParam.__init__(self, + action_size, + observation_size=0, # not used + sizes=sizes, + activs=activs, + list_attr_obs=list_attr_obs + ) + + self.x_dim = x_dim + + self.list_attr_obs_tau = [str(el) for el in list_attr_obs_tau] + self._define_adds_mults(tau_adds, "tau_adds", list_attr_obs_tau, 0.) + self._define_adds_mults(tau_mults, "tau_mults", list_attr_obs_tau, 1.) + + self.list_attr_obs_x = [str(el) for el in list_attr_obs_x] + self._define_adds_mults(x_adds, "x_adds", list_attr_obs_x, 0.) + self._define_adds_mults(x_mults, "x_mults", list_attr_obs_x, 1.) + + self.list_attr_obs_input_q = [str(el) for el in list_attr_obs_input_q] + self._define_adds_mults(input_q_adds, "input_q_adds", list_attr_obs_input_q, 0.) + self._define_adds_mults(input_q_mults, "input_q_mults", list_attr_obs_input_q, 1.) + + self.list_attr_obs_gm_out = [str(el) for el in list_attr_obs_gm_out] + self._define_adds_mults(gm_out_adds, "gm_out_adds", list_attr_obs_gm_out, 0.) + self._define_adds_mults(gm_out_mults, "gm_out_mults", list_attr_obs_gm_out, 1.) 
+ + # sizes of the neural network "blccks" + self.sizes_enc = sizes_enc + self.sizes_main = sizes_main + self.sizes_out_gm = sizes_out_gm + self.sizes_Qnet = sizes_Qnet + + # dimension of the topogly and number of powerline + self.dim_topo = dim_topo + + # dimension of the space (can be computed in the self.compute_dims) + self.input_q_dims = input_q_dims + self.gm_out_dims = gm_out_dims + self.x_dims = x_dims + self.tau_dims = tau_dims + + def get_obs_attr(self): + res = self.list_attr_obs_x + self.list_attr_obs_input_q + res += self.list_attr_obs_tau + ["topo_vect"] + self.list_attr_obs_gm_out + return res + + def compute_dims(self, env): + self.tau_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_tau] + self.x_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_x] + self.gm_out_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_gm_out] + self.input_q_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_input_q] + + def _define_adds_mults(self, vector, varname, attr_composed, default_val): + if vector is None: + vector = [float(default_val) for _ in attr_composed] + setattr(self, varname, vector) + + def center_reduce(self, env): + self._center_reduce_vect(env.get_obs(), "x") + self._center_reduce_vect(env.get_obs(), "tau") + self._center_reduce_vect(env.get_obs(), "gm_out") + self._center_reduce_vect(env.get_obs(), "input_q") diff --git a/l2rpn_baselines/LeapNetEncoded/__init__.py b/l2rpn_baselines/LeapNetEncoded/__init__.py new file mode 100644 index 0000000..c801db8 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/__init__.py @@ -0,0 +1,11 @@ +__all__ = [ + "LeapNetEncoded", + "evaluate", + "train", + "LeapNetEncoded_NN" +] + +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded +from l2rpn_baselines.LeapNetEncoded.evaluate import evaluate +from l2rpn_baselines.LeapNetEncoded.train import train +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py new file mode 100644 index 0000000..0d95e8f --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+ +import os +import tensorflow as tf + +from grid2op.MakeEnv import make +from grid2op.Runner import Runner +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.utils.save_log_gif import save_log_gif +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + +import pdb + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def evaluate(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """ + How to evaluate the performances of the trained DeepQSimple agent. + + Parameters + ---------- + env: :class:`grid2op.Environment` + The environment on which you evaluate your agent. + + name: ``str`` + The name of the trained baseline + + load_path: ``str`` + Path where the agent has been stored + + logs_path: ``str`` + Where to write the results of the assessment + + nb_episode: ``str`` + How many episodes to run during the assessment of the performances + + nb_process: ``int`` + On how many process the assessment will be made. (setting this > 1 can lead to some speed ups but can be + unstable on some plaform) + + max_steps: ``int`` + How many steps at maximum your agent will be assessed + + verbose: ``bool`` + Currently un used + + save_gif: ``bool`` + Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might + take a lot of ram) and drastically increase computation time. + + Returns + ------- + agent: :class:`l2rpn_baselines.utils.DeepQAgent` + The loaded agent that has been evaluated thanks to the runner. + + res: ``list`` + The results of the Runner on which the agent was tested. + + + Examples + ------- + You can evaluate a DeepQSimple this way: + + .. 
code-block:: python + + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.LeapNetEncoded import eval + + # Create dataset env + env = make("l2rpn_case14_sandbox", + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }) + + # Call evaluation interface + evaluate(env, + name="MyAwesomeAgent", + load_path="/WHERE/I/SAVED/THE/MODEL", + logs_path=None, + nb_episode=10, + nb_process=1, + max_steps=-1, + verbose=False, + save_gif=False) + + + """ + + # Limit gpu usage + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices): + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + runner_params = env.get_params_for_runner() + runner_params["verbose"] = verbose + + if load_path is None: + raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.") + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + nn_archi = LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + + # Run + # Create agent + agent = LeapNetEncoded(action_space=env.action_space, + name=name, + store_action=nb_process == 1, + nn_archi=nn_archi, + observation_space=env.observation_space) + + # Load weights from file + agent.load(load_path) + + # Build runner + runner = Runner(**runner_params, + agentClass=None, + agentInstance=agent) + + # Print model summary + stringlist = [] + agent.deep_q._model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + if verbose: + print(short_model_summary) + + # Run + os.makedirs(logs_path, exist_ok=True) + res = runner.run(path_save=logs_path, + nb_episode=nb_episode, + nb_process=nb_process, + max_iter=max_steps, + pbar=verbose) + + # Print summary + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + + if len(agent.dict_action): + # I output some of the actions played + print("The agent played {} different action".format(len(agent.dict_action))) + for id_, (nb, act, types) in agent.dict_action.items(): + print("Action with ID {} was played {} times".format(id_, nb)) + print("{}".format(act)) + print("-----------") + + # if logs_path is not None: + # for path_dhron, chron_name, cum_reward, nb_time_step, max_ts in res: + # ep_data = EpisodeData.from_disk(logs_path, chron_name) + + if save_gif: + if verbose: + print("Saving the gif of the episodes") + save_log_gif(logs_path, res) + + return agent, res + + +if __name__ == "__main__": + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.utils import cli_eval + + # Parse command line + args = cli_eval().parse_args() + + # Create dataset env + env = make(args.env_name, + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }) + + # Call evaluation interface + evaluate(env, + name=args.name, + load_path=os.path.abspath(args.load_path), + logs_path=args.logs_dir, + nb_episode=args.nb_episode, + nb_process=args.nb_process, + max_steps=args.max_steps, + verbose=args.verbose, + save_gif=args.save_gif) diff --git a/l2rpn_baselines/LeapNetEncoded/study.py b/l2rpn_baselines/LeapNetEncoded/study.py new file mode 100644 index 0000000..fe7a958 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/study.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 
+ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import tensorflow as tf +import numpy as np +from tqdm import tqdm + +from grid2op.MakeEnv import make +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + +import pdb + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def study(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """study the prediction of the grid_model""" + + # Limit gpu usage + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices): + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + runner_params = env.get_params_for_runner() + runner_params["verbose"] = verbose + + if load_path is None: + raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.") + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + nn_archi = LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + + # Run + # Create agent + agent = LeapNetEncoded(action_space=env.action_space, + name=name, + store_action=nb_process == 1, + nn_archi=nn_archi, + observation_space=env.observation_space) + + # Load weights from file + agent.load(load_path) + + # Print model summary + stringlist = [] + agent.deep_q._model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + if verbose: + print(short_model_summary) + + from grid2op.Agent import RandomAgent + from grid2op.Agent import DoNothingAgent + policy_agent = DoNothingAgent(env.action_space) + policy_agent.seed(0) + + env.set_id(0) + res = {k: ([], []) for k in nn_archi.list_attr_obs_gm_out} + with tqdm(desc="step") as pbar: + for i in range(nb_episode): + obs = env.reset() + reward = env.reward_range[0] + done = False + while not done: + obs_converted = agent.convert_obs(obs) + data_nn, true_output_grid = agent.deep_q._make_x_tau(obs_converted) + + for i, (var_n, add, mult) in enumerate(zip(nn_archi.list_attr_obs_gm_out, + nn_archi.gm_out_adds, + nn_archi.gm_out_mults)): + tmp = true_output_grid[i] + tmp = tmp / mult - add + true_output_grid[i] = tmp + + pred = agent.deep_q.grid_model.predict(data_nn, batch_size=1) + real_pred = [] + for i, (var_n, add, mult) in enumerate(zip(nn_archi.list_attr_obs_gm_out, + nn_archi.gm_out_adds, + nn_archi.gm_out_mults)): + tmp = pred[i] + tmp = tmp / mult - add + real_pred.append(tmp) + + for i, var_n in enumerate(nn_archi.list_attr_obs_gm_out): + res[var_n][0].append(real_pred[i].reshape(-1)) + res[var_n][1].append(true_output_grid[i].reshape(-1)) + + obs, reward, done, info = env.step(policy_agent.act(obs, reward, done)) + pbar.update(1) 
+ + print("Results") + from sklearn.metrics import mean_squared_error + for var_n, (pred, true) in res.items(): + true = np.array(true) + pred = np.array(pred) + RMSE = mean_squared_error(y_true=true, y_pred=pred, multioutput="raw_values", squared=False) + print("RMSE for {}: {:.2f} % variance".format(var_n, 100. * np.mean(RMSE / np.std(true)))) + return agent + + +if __name__ == "__main__": + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.utils import cli_eval + from grid2op.Parameters import Parameters + + # Parse command line + args = cli_eval().parse_args() + + # Create dataset env + param = Parameters() + param.NO_OVERFLOW_DISCONNECTION = True + env = make(args.env_name, + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }, + param=param) + + # Call evaluation interface + study(env, + name=args.name, + load_path=os.path.abspath(args.load_path), + logs_path=args.logs_dir, + nb_episode=args.nb_episode, + nb_process=args.nb_process, + max_steps=args.max_steps, + verbose=args.verbose, + save_gif=args.save_gif) diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py new file mode 100755 index 0000000..5d74e13 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import warnings +import tensorflow as tf + +from l2rpn_baselines.utils import cli_train +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.utils import TrainingParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY + + +def train(env, + name=DEFAULT_NAME, + iterations=1, + save_path=None, + load_path=None, + logs_dir=None, + training_param=None, + filter_action_fun=None, + verbose=True, + kwargs_converters={}, + kwargs_archi={}): + """ + This function implements the "training" part of the baselines "SAC". This is the "old" implementation + that most likely had bugs. We keep it here for backward compatibility, but it is not recommended to + use it on new projects. + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + name: ``str``` + The name of your agent. + + iterations: ``int`` + For how many iterations (steps) do you want to train your agent. NB these are not episode, these are steps. + + save_path: ``str`` + Where do you want to save your baseline. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. + + logs_dir: ``str`` + Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. + + training_param: :class:`l2rpn_baselines.utils.TrainingParam` + The parameters describing the way you will train your model. 
+ + filter_action_fun: ``function`` + A function to filter the action space. See + `IdToAct.filter_action `_ + documentation. + + verbose: ``bool`` + If you want something to be printed on the terminal (a better logging strategy will be put at some point) + + kwargs_converters: ``dict`` + A dictionary containing the key-word arguments pass at this initialization of the + :class:`grid2op.Converter.IdToAct` that serves as "Base" for the Agent. + + kwargs_archi: ``dict`` + Key word arguments used for making the :class:`DeepQ_NNParam` object that will be used to build the baseline. + + Returns + ------- + + baseline: :class:`DuelQLeapNet` + The trained baseline. + + + .. _Example-leapnetenc: + + Examples + --------- + Here is an example on how to train a DuelQLeapNet baseline. + + First define a python script, for example + + .. code-block:: python + + import grid2op + from grid2op.Reward import L2RPNReward + from l2rpn_baselines.utils import TrainingParam + from l2rpn_baselines.LeapNetEncoded import train + + # define the environment + env = grid2op.make("l2rpn_case14_sandbox", + reward_class=L2RPNReward) + + # use the default training parameters + tp = TrainingParam() + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env.dim_topo, + + "sizes_enc": (50, 50, 50, 50), + "sizes_main": (300, 300, 300), + "sizes_out_gm": (100, ), + "sizes_Qnet": (200, 200, 200) + } + + nm_ = args.name if args.name is not None else DEFAULT_NAME + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi, + verbose=True) + finally: + env.close() + + """ + + # Limit gpu usage + try: + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except AttributeError: + # issue of https://stackoverflow.com/questions/59266150/attributeerror-module-tensorflow-core-api-v2-config-has-no-attribute-list-p + try: + physical_devices = tf.config.experimental.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + + if training_param is None: + training_param = TrainingParam() + + # get the size of the action space + kwargs_archi["action_size"] = LeapNetEncoded.get_action_size(env.action_space, filter_action_fun, kwargs_converters) + kwargs_archi["observation_size"] = 0 # this is not used anyway + if load_path is not None: + # TODO test that + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + print("INFO: Reloading a model, the architecture parameters will be ignored") + nn_archi = 
LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + else: + nn_archi = LeapNetEncoded_NNParam(**kwargs_archi) + # because i was lazy enough not to copy paste all the dimensions there + nn_archi.compute_dims(env) + # because i want data approximately reduced (for the learning process to be smoother) + nn_archi.center_reduce(env) + + baseline = LeapNetEncoded(action_space=env.action_space, + nn_archi=nn_archi, + name=name, + istraining=True, + filter_action_fun=filter_action_fun, + verbose=verbose, + **kwargs_converters + ) + + if load_path is not None: + print("INFO: Reloading a model, training parameters will be ignored") + baseline.load(load_path) + training_param = baseline._training_param + + baseline.train(env, + iterations, + save_path=save_path, + logdir=logs_dir, + training_param=training_param) + # as in our example (and in our explanation) we recommend to save the mode regurlarly in the "train" function + # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than + # recommended to save the "baseline" at the end of this function with: + # baseline.save(path_save) + + +if __name__ == "__main__": + # import grid2op + import numpy as np + from grid2op.Parameters import Parameters + from grid2op import make + from grid2op.Reward import BaseReward + from grid2op.dtypes import dt_float + import re + try: + from lightsim2grid.LightSimBackend import LightSimBackend + backend = LightSimBackend() + except: + from grid2op.Backend import PandaPowerBackend + backend = PandaPowerBackend() + + args = cli_train().parse_args() + + # is it highly recommended to modify the reward depening on the algorithm. + # for example here i will push my algorithm to learn that plyaing illegal or ambiguous action is bad + class MyReward(BaseReward): + power_rho = int(4) # to which "power" is put the rho values + + penalty_powerline_disco = 1.0 # how to penalize the powerline disconnected that can be reconnected + + # how to penalize the fact that a powerline will be disconnected next time steps, because it's close to + # an overflow + penalty_powerline_close_disco = 1.0 + + # cap the minimum reward (put None to ignore) + cap_min = -0.5 # if the minimum reward is too low, model will not learn easily. It will be "scared" to take + # actions. Because you win more or less points 1 by 1, but you can lose them + # way way faster. 
+ + def __init__(self): + self.reward_min = 0 + self.reward_max = 0 + self.ts_overflow = None + + def initialize(self, env): + self.ts_overflow = env.parameters.NB_TIMESTEP_OVERFLOW_ALLOWED-1 + # now calibrate min and max reward + hard_overflow = env.parameters.HARD_OVERFLOW_THRESHOLD + max_flow_penalty = self.flow_penalty(rho=np.ones(env.n_line) * hard_overflow) / env.n_line + disconnected_powerline_that_can_be_reconnected = self.penalty_powerline_disco + disconnected_still_connected_powerline_on_overflow = self.penalty_powerline_close_disco + self.reward_min = max_flow_penalty - disconnected_powerline_that_can_be_reconnected + self.reward_min -= disconnected_still_connected_powerline_on_overflow + if self.cap_min is not None: + self.reward_min = max(self.reward_min, self.cap_min) + self.reward_max = 1.0 + + def flow_penalty(self, rho): + tmp = 1 - rho**self.power_rho + return tmp.sum() + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + if has_error or is_ambiguous: + # previous action was bad + res = self.reward_min #self.reward_min + elif is_done: + # really strong reward if an episode is over without game over + res = self.reward_max + else: + if env.get_obs() is not None: + obs = env.get_obs() + res = self.flow_penalty(rho=obs.rho) + disconnected_powerline_that_can_be_reconnected = np.sum((obs.time_before_cooldown_line == 0) & + (~obs.line_status)) + disconnected_still_connected_powerline_on_overflow = np.sum((obs.timestep_overflow == self.ts_overflow) & + (obs.rho >= 1.)) + res -= disconnected_powerline_that_can_be_reconnected * self.penalty_powerline_disco + res -= disconnected_still_connected_powerline_on_overflow * self.penalty_powerline_close_disco + else: + res = env.n_line + res /= env.n_line + if is_illegal: + if res > 0.: + res *= 0.1 # divide by 10 reward for illegal actions + else: + res *= 10. + if not np.isfinite(res): + res = self.reward_min + + if self.cap_min is not None: + res = max(res, self.cap_min) + return dt_float(res) + + # Use custom params + + # Create grid2op game environement + env_init = None + from grid2op.Chronics import MultifolderWithCache + game_param = Parameters() + game_param.NB_TIMESTEP_COOLDOWN_SUB = 2 + game_param.NB_TIMESTEP_COOLDOWN_LINE = 2 + env = make(args.env_name, + param=game_param, + reward_class=MyReward, + backend=backend, + chronics_class=MultifolderWithCache + ) + + if env.name == "l2rpn_wcci_2020": + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*Scenario_february_.*$", x) is not None) + env.chronics_handler.real_data.reset() + elif env.name == "l2rpn_case14_sandbox": + # all data can be loaded into memory + # env.chronics_handler.real_data.set_filter(lambda x: True) + env.chronics_handler.real_data.reset() + + # env.chronics_handler.real_data. 
+ env_init = env + if args.nb_env > 1: + from l2rpn_baselines.utils import make_multi_env + env = make_multi_env(env_init=env_init, nb_env=int(args.nb_env)) + + tp = TrainingParam() + # NN training + tp.lr = 1e-5 + tp.lr_decay_steps = 300000 + tp.minibatch_size = 32 * int(args.nb_env) + tp.update_freq = tp.minibatch_size / 2 + + # limit the number of time steps played per scenarios + tp.step_increase_nb_iter = None # None to deactivate it + tp.min_iter = None + tp.update_nb_iter = None # once 100 scenarios are solved, increase of "step_increase_nb_iter" + + # oversampling hard scenarios + tp.oversampling_rate = None # None to deactivate it + + # experience replay + tp.buffer_size = 1000000 + + # just observe the data for a while + tp.min_observe = None # int(10000) + + # e greedy + tp.min_observation = 128 + tp.initial_epsilon = 0.2 + tp.final_epsilon = 1./(288.) + tp.step_for_final_epsilon = int(1e5) + # TODO add the "i dont do anything for a few time steps at the beginning of the training" + + # don't start always at the same hour (if not None) otherwise random sampling, see docs + tp.random_sample_datetime_start = None + + # saving, logging etc. + tp.save_model_each = 10000 + tp.update_tensorboard_freq = 256 + + # which actions i keep + if env.name == "l2rpn_case14_sandbox": + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": True, + "set_topo_vect": False, + "redispacth": False + } + else: + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": False, + "set_topo_vect": False, + "redispacth": False + } + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env_init.dim_topo, + + "sizes_enc": (50, 50, 50, 50), + "sizes_main": (300, 300, 300), + "sizes_out_gm": (100, ), + "sizes_Qnet": (200, 200, 200) + } + + nm_ = args.name if args.name is not None else DEFAULT_NAME + # python3 train.py --env_name="l2rpn_wcci_2020" --save_path="model_saved" --logs_dir="tf_logs" --num_train_steps=10000 --name="InitialTest4 + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi, + verbose=True) + finally: + env.close() + if args.nb_env > 1: + env_init.close() diff --git a/l2rpn_baselines/SAC/SAC_NN.py b/l2rpn_baselines/SAC/SAC_NN.py index 0a40967..172fb16 100644 --- a/l2rpn_baselines/SAC/SAC_NN.py +++ b/l2rpn_baselines/SAC/SAC_NN.py @@ -35,6 +35,13 @@ class SAC_NN(BaseDeepQ): However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ` with custom parameters class (in this calse :class:`SAC_NNParam` is flexible enough to meet our needs. 
+ References + ----------- + Original paper: + https://arxiv.org/abs/1801.01290 + + modified for discrete action space: + https://arxiv.org/abs/1910.07207 """ def __init__(self, nn_params, @@ -60,7 +67,6 @@ def __init__(self, self.model_Q2 = None self.model_policy = None - self.construct_q_network() self.previous_size = 0 self.previous_eyes = None self.previous_arange = None @@ -77,6 +83,8 @@ def __init__(self, self.schedule_lr_value = None self.optimizer_value = None + self.construct_q_network() + def _build_q_NN(self): input_states = Input(shape=(self._observation_size,)) input_action = Input(shape=(self._action_size,)) @@ -147,19 +155,20 @@ def _get_eye_pm(self, batch_size): self.previous_size = batch_size return self.previous_eyes, self.previous_arange - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """ predict the next movements in a vectorized fashion """ if batch_size is None: batch_size = data.shape[0] rand_val = np.random.random(data.shape[0]) - p_actions = self.model_policy.predict(data, batch_size=batch_size) + p_actions = self.model_policy(data, training=training).numpy() opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1) opt_policy = 1.0 * opt_policy_orig opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) opt_policy = opt_policy.astype(np.int) - return opt_policy, p_actions[:, opt_policy] + idx = np.arange(batch_size) + return opt_policy, p_actions[idx, opt_policy], p_actions def _get_eye_train(self, batch_size): if batch_size != self.previous_size_train: @@ -175,18 +184,25 @@ def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, ba if batch_size is None: batch_size = s_batch.shape[0] target = np.zeros((batch_size, 1)) + # training of the action state value networks last_action = np.zeros((batch_size, self._action_size)) + # Save the graph just the first time if tf_writer is not None: tf.summary.trace_on() - fut_action = self.model_value_target.predict(s2_batch, batch_size=batch_size).reshape(-1) + # TODO is it s2 or s ? For me it should be s... + fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1) + # TODO ***_target should be for the Q function instead imho + if tf_writer is not None: with tf_writer.as_default(): tf.summary.trace_export("model_value_target-graph", 0) tf.summary.trace_off() + # TODO is it rather `targets[:, a_batch]` target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action loss = self.model_Q.train_on_batch([s_batch, last_action], target) loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target) diff --git a/l2rpn_baselines/SACOld/SACOld.py b/l2rpn_baselines/SACOld/SACOld.py new file mode 100644 index 0000000..ab5ad39 --- /dev/null +++ b/l2rpn_baselines/SACOld/SACOld.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+
+from l2rpn_baselines.utils import DeepQAgent
+from l2rpn_baselines.SAC.SAC_NN import SAC_NN
+DEFAULT_NAME = "SACOld"
+
+
+class SACOld(DeepQAgent):
+    """
+    This is the :class:`l2rpn_baselines.utils.DeepQAgent` representing the SAC agent (old implementation).
+
+    Please don't use this baseline if you start a new project; prefer the new, double-checked
+    SAC implementation (:class:`l2rpn_baselines.SAC.SAC`) instead.
+    """
+    pass
diff --git a/l2rpn_baselines/SACOld/SACOld_NN.py b/l2rpn_baselines/SACOld/SACOld_NN.py
new file mode 100644
index 0000000..762d761
--- /dev/null
+++ b/l2rpn_baselines/SACOld/SACOld_NN.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import numpy as np
+import os
+import tensorflow as tf
+
+# tf2.0 friendly
+import warnings
+
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    from tensorflow.keras.models import load_model, Sequential, Model
+    from tensorflow.keras.layers import Activation, Dense
+    from tensorflow.keras.layers import Input, Concatenate
+
+from l2rpn_baselines.utils import BaseDeepQ, TrainingParam
+
+
+# This class implements the "Soft Actor Critic" model.
+# It is a custom implementation, courtesy of Clement Goubet
+# The original paper is: https://arxiv.org/abs/1801.01290
+class SACOld_NN(BaseDeepQ):
+    """
+    Constructs the desired soft actor critic network.
+
+    Compared to other baselines shown elsewhere (*eg* :class:`l2rpn_baselines.DeepQSimple`)
+    the implementation of the SAC is a bit more tricky
+    (and was most likely NOT done properly in this class). For a more correct implementation
+    of SAC please look at the :class:`l2rpn_baselines.SAC.SAC` instead. This class is only
+    present for backward compatibility.
+
+    However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ` with custom
+    parameters class (in this case :class:`SACOld_NNParam`) is flexible enough to meet our needs.
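+
+    A minimal sketch of how the architecture is described (all the sizes and observation attributes
+    below are purely illustrative; see the ``train`` function of this baseline for a complete example):
+
+    .. code-block:: python
+
+        from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+
+        # illustrative values only
+        nn_archi = SACOld_NNParam(action_size=10,
+                                  observation_size=100,
+                                  sizes=[100, 50],  # hidden layers of the two Q networks
+                                  activs=["relu", "relu"],
+                                  list_attr_obs=["prod_p", "load_p", "rho"],
+                                  sizes_value=[100],  # hidden layers of the "value" network
+                                  activs_value=["relu"],
+                                  sizes_policy=[100, 50],  # hidden layers of the "policy" network
+                                  activs_policy=["relu", "relu"])
+        # nn_archi.nn_class is SACOld_NN: it is the class used to build the actual keras models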
+ + References + ----------- + Original paper: + https://arxiv.org/abs/1801.01290 + + modified for discrete action space: + https://arxiv.org/abs/1910.07207 + """ + def __init__(self, + nn_params, + training_param=None, + verbose=False): + if training_param is None: + training_param = TrainingParam() + BaseDeepQ.__init__(self, + nn_params, + training_param, + verbose=verbose) + + # TODO add as meta param the number of "Q" you want to use (here 2) + # TODO add as meta param size and types of the networks + self.average_reward = 0 + self.life_spent = 1 + self.qvalue_evolution = np.zeros((0,)) + self.Is_nan = False + + self.model_value_target = None + self.model_value = None + self.model_Q = None + self.model_Q2 = None + self.model_policy = None + + self.previous_size = 0 + self.previous_eyes = None + self.previous_arange = None + self.previous_size_train = 0 + self.previous_eyes_train = None + + # optimizers and learning rate + self.schedule_lr_policy = None + self.optimizer_policy = None + self.schedule_lr_Q = None + self.optimizer_Q = None + self.schedule_lr_Q2 = None + self.optimizer_Q2 = None + self.schedule_lr_value = None + self.optimizer_value = None + + self.construct_q_network() + + def _build_q_NN(self): + input_states = Input(shape=(self._observation_size,)) + input_action = Input(shape=(self._action_size,)) + + input_layer = Concatenate()([input_states, input_action]) + lay = input_layer + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes, self._nn_archi.activs)): + lay = Dense(size, name="layer_{}".format(lay_num))(lay) # put at self.action_size + lay = Activation(act)(lay) + + advantage = Dense(1, activation='linear')(lay) + + model = Model(inputs=[input_states, input_action], outputs=[advantage]) + return model + + def _build_model_value(self): + input_states = Input(shape=(self._observation_size,)) + + lay = input_states + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_value, self._nn_archi.activs_value)): + lay = Dense(size)(lay) + lay = Activation(act)(lay) + + advantage = Dense(self._action_size, activation='relu')(lay) + state_value = Dense(1, activation='linear', name="state_value")(advantage) + model = Model(inputs=[input_states], outputs=[state_value]) + return model + + def construct_q_network(self): + """ + This constructs all the networks needed for the SAC agent. 
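+
+        More precisely it builds (see the body of this method):
+
+        - ``model_Q`` and ``model_Q2``: the two Q networks (trained with a ``mse`` loss);
+        - ``model_value`` and ``model_value_target``: the state value network and its target copy;
+        - ``model_policy``: the softmax policy network (trained with a ``categorical_crossentropy`` loss).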
+ """ + self.model_Q = self._build_q_NN() + self.schedule_lr_Q, self.optimizer_Q = self.make_optimiser() + self.model_Q.compile(loss='mse', optimizer=self.optimizer_Q) + + self.model_Q2 = self._build_q_NN() + self.schedule_lr_Q2, self.optimizer_Q2 = self.make_optimiser() + self.model_Q2.compile(loss='mse', optimizer=self.optimizer_Q2) + + # state value function approximation + self.model_value = self._build_model_value() + self.schedule_lr_value, self.optimizer_value = self.make_optimiser() + self._optimizer_model = self.optimizer_value + self.model_value.compile(loss='mse', optimizer=self.optimizer_value) + + self.model_value_target = self._build_model_value() + self.model_value_target.set_weights(self.model_value.get_weights()) + + # policy function approximation + self.model_policy = Sequential() + # proba of choosing action a depending on policy pi + input_states = Input(shape=(self._observation_size,)) + lay = input_states + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_policy, self._nn_archi.activs_policy)): + lay = Dense(size)(lay) + lay = Activation(act)(lay) + soft_proba = Dense(self._action_size, activation="softmax", kernel_initializer='uniform', name="soft_proba")(lay) + self.model_policy = Model(inputs=[input_states], outputs=[soft_proba]) + self.schedule_lr_policy, self.optimizer_policy = self.make_optimiser() + self.model_policy.compile(loss='categorical_crossentropy', optimizer=self.optimizer_policy) + + def _get_eye_pm(self, batch_size): + if batch_size != self.previous_size: + tmp = np.zeros((batch_size, self._action_size), dtype=np.float32) + self.previous_eyes = tmp + self.previous_arange = np.arange(batch_size) + self.previous_size = batch_size + return self.previous_eyes, self.previous_arange + + def predict_movement(self, data, epsilon, batch_size=None, training=False): + """ + predict the next movements in a vectorized fashion + """ + if batch_size is None: + batch_size = data.shape[0] + rand_val = np.random.random(data.shape[0]) + p_actions = self.model_policy(data, training=training).numpy() + opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1) + opt_policy = 1.0 * opt_policy_orig + opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) + opt_policy = opt_policy.astype(np.int) + return opt_policy, p_actions[:, opt_policy], p_actions + + def _get_eye_train(self, batch_size): + if batch_size != self.previous_size_train: + self.previous_eyes_train = np.repeat(np.eye(self._action_size), + batch_size * np.ones(self._action_size, dtype=np.int), + axis=0) + self.previous_eyes_train = tf.convert_to_tensor(self.previous_eyes_train, dtype=tf.float32) + self.previous_size_train = batch_size + return self.previous_eyes_train + + def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): + """Trains networks to fit given parameters""" + if batch_size is None: + batch_size = s_batch.shape[0] + target = np.zeros((batch_size, 1)) + + # training of the action state value networks + last_action = np.zeros((batch_size, self._action_size)) + + # Save the graph just the first time + if tf_writer is not None: + tf.summary.trace_on() + # TODO is it s2 or s ? For me it should be s... 
+ fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1) + # TODO ***_target should be for the Q function instead imho + + if tf_writer is not None: + with tf_writer.as_default(): + tf.summary.trace_export("model_value_target-graph", 0) + tf.summary.trace_off() + + # TODO is it rather `targets[:, a_batch]` + target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + loss = self.model_Q.train_on_batch([s_batch, last_action], target) + loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target) + + self.life_spent += 1 + temp = 1 / np.log(self.life_spent) / 2 + tiled_batch = np.tile(s_batch, (self._action_size, 1)) + tiled_batch_ts = tf.convert_to_tensor(tiled_batch) + # tiled_batch: output something like: batch, batch, batch + # TODO save that somewhere not to compute it each time, you can even save this in the + # TODO tensorflow graph! + tmp = self._get_eye_train(batch_size) + + action_v1_orig = self.model_Q.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1) + action_v2_orig = self.model_Q2.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1) + action_v1 = action_v1_orig - np.amax(action_v1_orig, axis=-1).reshape(batch_size, 1) + new_proba = np.exp(action_v1 / temp) / np.sum(np.exp(action_v1 / temp), axis=-1).reshape(batch_size, 1) + new_proba_ts = tf.convert_to_tensor(new_proba) + loss_policy = self.model_policy.train_on_batch(s_batch, new_proba_ts) + + target_pi = self.model_policy.predict(s_batch, batch_size=batch_size) + value_target = np.fmin(action_v1_orig[0, a_batch], action_v2_orig[0, a_batch]) - np.sum( + target_pi * np.log(target_pi + 1e-6)) + value_target_ts = tf.convert_to_tensor(value_target.reshape(-1, 1)) + loss_value = self.model_value.train_on_batch(s_batch, value_target_ts) + + self.Is_nan = np.isnan(loss) + np.isnan(loss_2) + np.isnan(loss_policy) + np.isnan(loss_value) + return np.all(np.isfinite(loss)) & np.all(np.isfinite(loss_2)) & np.all(np.isfinite(loss_policy)) & \ + np.all(np.isfinite(loss_value)) + + @staticmethod + def _get_path_model(path, name=None): + if name is None: + path_model = path + else: + path_model = os.path.join(path, name) + path_target_model = "{}_target".format(path_model) + path_modelQ = "{}_Q".format(path_model) + path_modelQ2 = "{}_Q2".format(path_model) + path_policy = "{}_policy".format(path_model) + return path_model, path_target_model, path_modelQ, path_modelQ2, path_policy + + def save_network(self, path, name=None, ext="h5"): + """ + Saves all the models with unique names + """ + path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name) + self.model_value.save('{}.{}'.format(path_model, ext)) + self.model_value_target.save('{}.{}'.format(path_target_model, ext)) + self.model_Q.save('{}.{}'.format(path_modelQ, ext)) + self.model_Q2.save('{}.{}'.format(path_modelQ2, ext)) + self.model_policy.save('{}.{}'.format(path_policy, ext)) + + def load_network(self, path, name=None, ext="h5"): + """ + We load all the models using the keras "load_model" function. 
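+
+        More precisely, the networks are first re-built (``construct_q_network``) and the weights of each of
+        the five models (value, value target, Q, Q2 and policy) are then loaded (``load_weights``) from the
+        files written by ``save_network``.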
+        """
+        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
+        self.construct_q_network()
+        self.model_value.load_weights('{}.{}'.format(path_model, ext))
+        self.model_value_target.load_weights('{}.{}'.format(path_target_model, ext))
+        self.model_Q.load_weights('{}.{}'.format(path_modelQ, ext))
+        self.model_Q2.load_weights('{}.{}'.format(path_modelQ2, ext))
+        self.model_policy.load_weights('{}.{}'.format(path_policy, ext))
+        if self.verbose:
+            print("Successfully loaded network.")
+
+    def target_train(self):
+        """
+        This updates the target model.
+        """
+        model_weights = self.model_value.get_weights()
+        target_model_weights = self.model_value_target.get_weights()
+        for i in range(len(model_weights)):
+            target_model_weights[i] = self._training_param.tau * model_weights[i] + (1 - self._training_param.tau) * \
+                                      target_model_weights[i]
+        self.model_value_target.set_weights(model_weights)
diff --git a/l2rpn_baselines/SACOld/SACOld_NNParam.py b/l2rpn_baselines/SACOld/SACOld_NNParam.py
new file mode 100644
index 0000000..2521842
--- /dev/null
+++ b/l2rpn_baselines/SACOld/SACOld_NNParam.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+import copy
+
+from l2rpn_baselines.utils import NNParam
+from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+
+
+class SACOld_NNParam(NNParam):
+    """
+
+    Do not use this SACOld class; prefer the "more correct"
+    :class:`l2rpn_baselines.SAC.SAC` class for new projects instead. This module is only here
+    for backward compatibility.
+
+    Attributes
+    ----------
+    sizes_value: ``list``
+        List of integers, each one representing the size of a hidden layer of the "value" neural network.
+
+    activs_value: ``list``
+        List of ``str``, one per hidden layer of the "value" neural network, indicating which activation
+        function to use.
+
+    sizes_policy: ``list``
+        List of integers, each representing the size of a hidden layer of the "policy" network.
+ + activs_policy: ``list`` + List of ``str``: The activation functions (for each layer) of the policy network + + """ + _int_attr = copy.deepcopy(NNParam._int_attr) + _float_attr = copy.deepcopy(NNParam._float_attr) + _str_attr = copy.deepcopy(NNParam._str_attr) + _list_float = copy.deepcopy(NNParam._list_float) + _list_str = copy.deepcopy(NNParam._list_str) + _list_int = copy.deepcopy(NNParam._list_int) + + _list_str += ["activs_value", "activs_policy"] + _list_int += ["sizes_value", "sizes_policy"] + + nn_class = SACOld_NN + + def __init__(self, + action_size, + observation_size, # TODO this might not be usefull + sizes, + activs, + list_attr_obs, + sizes_value, + activs_value, + sizes_policy, + activs_policy + ): + NNParam.__init__(self, + action_size, + observation_size, # TODO this might not be usefull + sizes, + activs, + list_attr_obs + ) + self.sizes_value = sizes_value + self.activs_value = activs_value + self.sizes_policy = sizes_policy + self.activs_policy = activs_policy diff --git a/l2rpn_baselines/SACOld/__init__.py b/l2rpn_baselines/SACOld/__init__.py new file mode 100644 index 0000000..a2ccffb --- /dev/null +++ b/l2rpn_baselines/SACOld/__init__.py @@ -0,0 +1,11 @@ +__all__ = [ + "SACOld", + "evaluate", + "train", + "SACOld_NNParam" +] + +from l2rpn_baselines.SACOld.SACOld import SACOld +from l2rpn_baselines.SACOld.evaluate import evaluate +from l2rpn_baselines.SACOld.train import train +from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam diff --git a/l2rpn_baselines/SACOld/evaluate.py b/l2rpn_baselines/SACOld/evaluate.py new file mode 100644 index 0000000..c4a710d --- /dev/null +++ b/l2rpn_baselines/SACOld/evaluate.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import tensorflow as tf + +from grid2op.MakeEnv import make +from grid2op.Runner import Runner +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.utils.save_log_gif import save_log_gif +from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME +from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam +from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def evaluate(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """ + How to evaluate the performances of the trained SAC agent (old implementation). + + Please use the new implementation instead. + + Parameters + ---------- + env: :class:`grid2op.Environment` + The environment on which you evaluate your agent. 
+
+    name: ``str``
+        The name of the trained baseline
+
+    load_path: ``str``
+        Path where the agent has been stored
+
+    logs_path: ``str``
+        Where to write the results of the assessment
+
+    nb_episode: ``int``
+        How many episodes to run during the assessment of the performances
+
+    nb_process: ``int``
+        On how many processes the assessment will be made (setting this > 1 can lead to some speed up but can be
+        unstable on some platforms)
+
+    max_steps: ``int``
+        How many steps at maximum your agent will be assessed
+
+    verbose: ``bool``
+        Currently unused
+
+    save_gif: ``bool``
+        Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might
+        take a lot of ram) and drastically increase computation time.
+
+    Returns
+    -------
+    agent: :class:`l2rpn_baselines.utils.DeepQAgent`
+        The loaded agent that has been evaluated thanks to the runner.
+
+    res: ``list``
+        The results of the Runner on which the agent was tested.
+
+
+    Examples
+    --------
+    You can evaluate a SACOld agent this way:
+
+    .. code-block:: python
+
+        from grid2op import make
+        from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward
+        from l2rpn_baselines.SACOld import evaluate
+
+        # Create dataset env
+        env = make("l2rpn_case14_sandbox",
+                   reward_class=L2RPNSandBoxScore,
+                   other_rewards={
+                       "reward": L2RPNReward
+                   })
+
+        # Call evaluation interface
+        evaluate(env,
+                 name="MyAwesomeAgent",
+                 load_path="/WHERE/I/SAVED/THE/MODEL",
+                 logs_path=None,
+                 nb_episode=10,
+                 nb_process=1,
+                 max_steps=-1,
+                 verbose=False,
+                 save_gif=False)
+    """
+
+    # Limit gpu usage
+    physical_devices = tf.config.list_physical_devices('GPU')
+    if len(physical_devices):
+        tf.config.experimental.set_memory_growth(physical_devices[0], True)
+
+    runner_params = env.get_params_for_runner()
+    runner_params["verbose"] = verbose
+
+    if load_path is None:
+        raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.")
+    path_model, path_target_model = SACOld_NN.get_path_model(load_path, name)
+    nn_archi = SACOld_NNParam.from_json(os.path.join(path_model, "nn_architecture.json"))
+
+    # Run
+    # Create agent
+    agent = SACOld(action_space=env.action_space,
+                   name=name,
+                   store_action=nb_process == 1,
+                   nn_archi=nn_archi,
+                   observation_space=env.observation_space)
+
+    # Load weights from file
+    agent.load(load_path)
+
+    # Print model summary
+    stringlist = []
+    agent.deep_q.model_value.summary(print_fn=lambda x: stringlist.append(x))
+    short_model_summary = "\n".join(stringlist)
+
+    if verbose:
+        print("Value model: {}".format(short_model_summary))
+
+    # Build runner
+    runner = Runner(**runner_params,
+                    agentClass=None,
+                    agentInstance=agent)
+
+    # Run
+    os.makedirs(logs_path, exist_ok=True)
+    res = runner.run(path_save=logs_path,
+                     nb_episode=nb_episode,
+                     nb_process=nb_process,
+                     max_iter=max_steps,
+                     pbar=verbose)
+
+    # Print summary
+
+    if verbose:
+        print("Evaluation summary:")
+        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
+            msg_tmp = "chronics at: {}".format(chron_name)
+            msg_tmp += "\ttotal score: {:.6f}".format(cum_reward)
+            msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
+            print(msg_tmp)
+
+        if len(agent.dict_action):
+            # I output some of the actions played
+            print("The agent played {} different actions".format(len(agent.dict_action)))
+            for id_, (nb, act, types) in agent.dict_action.items():
+                print("Action with ID {} was played {} times".format(id_, nb))
+                print("{}".format(act))
+                print("-----------")
+
+    if save_gif:
+        if verbose:
+            print("Saving the gif of the episodes")
+
        save_log_gif(logs_path, res)
+
+    return agent, res
+
+
+if __name__ == "__main__":
+    from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward
+    from l2rpn_baselines.utils import cli_eval
+
+    # Parse command line
+    args = cli_eval().parse_args()
+
+    # Create dataset env
+    env = make(args.env_name,
+               reward_class=L2RPNSandBoxScore,
+               other_rewards={
+                   "reward": L2RPNReward
+               })
+
+    # Call evaluation interface
+    evaluate(env,
+             name=args.name,
+             load_path=os.path.abspath(args.load_path),
+             logs_path=args.logs_dir,
+             nb_episode=args.nb_episode,
+             nb_process=args.nb_process,
+             max_steps=args.max_steps,
+             verbose=args.verbose,
+             save_gif=args.save_gif)
diff --git a/l2rpn_baselines/SACOld/train.py b/l2rpn_baselines/SACOld/train.py
new file mode 100755
index 0000000..48a003c
--- /dev/null
+++ b/l2rpn_baselines/SACOld/train.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import os
+import tensorflow as tf
+import warnings
+
+from l2rpn_baselines.utils import cli_train
+from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME
+from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+from l2rpn_baselines.utils import TrainingParam
+from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY
+
+
+def train(env,
+          name=DEFAULT_NAME,
+          iterations=1,
+          save_path=None,
+          load_path=None,
+          logs_dir=None,
+          training_param=None,
+          filter_action_fun=None,
+          verbose=True,
+          kwargs_converters={},
+          kwargs_archi={}):
+    """
+    This function implements the "training" part of the baseline "SAC" (old, buggy implementation).
+    Please use :class:`l2rpn_baselines.SAC.SAC` for new projects.
+
+    Parameters
+    ----------
+    env: :class:`grid2op.Environment`
+        The environment on which you need to train your agent.
+
+    name: ``str``
+        The name of your agent.
+
+    iterations: ``int``
+        For how many iterations (steps) do you want to train your agent. NB these are not episodes, these are steps.
+
+    save_path: ``str``
+        Where do you want to save your baseline.
+
+    load_path: ``str``
+        If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded
+        some of the arguments provided to this function will not be used.
+
+    logs_dir: ``str``
+        Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them.
+
+    verbose: ``bool``
+        If you want something to be printed on the terminal (a better logging strategy will be put at some point)
+
+    training_param: :class:`l2rpn_baselines.utils.TrainingParam`
+        The parameters describing the way you will train your model.
+
+    filter_action_fun: ``function``
+        A function to filter the action space. See
+        `IdToAct.filter_action `_
+        documentation.
+
+    kwargs_converters: ``dict``
+        A dictionary containing the key-word arguments passed at the initialization of the
+        :class:`grid2op.Converter.IdToAct` that serves as "Base" for the Agent.
+
+    kwargs_archi: ``dict``
+        Key-word arguments used for making the :class:`SACOld_NNParam` object that will be used to build the baseline.
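+
+        **NB** the ``action_size`` key does not need to be provided: this function computes it from the
+        action space and ``kwargs_converters`` before building the :class:`SACOld_NNParam`, roughly as in
+        the (simplified, illustrative) sketch below:
+
+        .. code-block:: python
+
+            # what train() does internally, in substance (see the body of this function)
+            kwargs_archi["action_size"] = SACOld.get_action_size(env.action_space, filter_action_fun, kwargs_converters)
+            nn_archi = SACOld_NNParam(**kwargs_archi)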
+ + Returns + ------- + + baseline: :class:`SACOld` + The trained baseline. + + + .. _Example-sacold: + + Examples + --------- + Here is an example on how to train a SAC baseline. + + First define a python script, for example + + .. code-block:: python + + import grid2op + from grid2op.Reward import L2RPNReward + from l2rpn_baselines.utils import TrainingParam, NNParam + from l2rpn_baselines.SACOld import train + + # define the environment + env = grid2op.make("l2rpn_case14_sandbox", + reward_class=L2RPNReward) + + # use the default training parameters + tp = TrainingParam() + + # this will be the list of what part of the observation I want to keep + # more information on https://grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes + li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] + + # neural network architecture + observation_size = NNParam.get_obs_size(env, li_attr_obs_X) + sizes_q = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + sizes_v = [800, 800] # sizes of each hidden layers + sizes_pol = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + # select some part of the action + # more information at https://grid2op.readthedocs.io/en/latest/converter.html#grid2op.Converter.IdToAct.init_converter + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False + } + # define the name of the model + nm_ = "AnneOnymous" + try: + train(env, + name=nm_, + iterations=10000, + save_path="/WHERE/I/SAVED/THE/MODEL", + load_path=None, + logs_dir="/WHERE/I/SAVED/THE/LOGS", + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + finally: + env.close() + + """ + + # Limit gpu usage + try: + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except AttributeError: + # issue of https://stackoverflow.com/questions/59266150/attributeerror-module-tensorflow-core-api-v2-config-has-no-attribute-list-p + try: + physical_devices = tf.config.experimental.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + + if training_param is None: + training_param = TrainingParam() + + # compute the proper size for the converter + kwargs_archi["action_size"] = SACOld.get_action_size(env.action_space, filter_action_fun, kwargs_converters) + + if load_path is not None: + path_model, path_target_model = SACOld_NN.get_path_model(load_path, name) + if verbose: + print("INFO: Reloading a model, the architecture parameters provided will be ignored") + nn_archi = SACOld_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + else: + nn_archi = SACOld_NNParam(**kwargs_archi) + + baseline = SACOld(action_space=env.action_space, + 
nn_archi=nn_archi, + name=name, + istraining=True, + verbose=verbose, + **kwargs_converters + ) + + if load_path is not None: + if verbose: + print("INFO: Reloading a model, training parameters will be ignored") + baseline.load(load_path) + training_param = baseline._training_param + + baseline.train(env, + iterations, + save_path=save_path, + logdir=logs_dir, + training_param=training_param) + # as in our example (and in our explanation) we recommend to save the mode regurlarly in the "train" function + # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than + # recommended to save the "baseline" at the end of this function with: + # baseline.save(path_save) + + +if __name__ == "__main__": + # import grid2op + import numpy as np + from grid2op.Parameters import Parameters + from grid2op import make + from grid2op.Reward import L2RPNReward + import re + try: + from lightsim2grid.LightSimBackend import LightSimBackend + backend = LightSimBackend() + except: + from grid2op.Backend import PandaPowerBackend + backend = PandaPowerBackend() + + args = cli_train().parse_args() + + # is it highly recommended to modify the reward depening on the algorithm. + # for example here i will push my algorithm to learn that plyaing illegal or ambiguous action is bad + class MyReward(L2RPNReward): + def initialize(self, env): + self.reward_min = 0.0 + self.reward_max = 1.0 + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + if has_error or is_illegal or is_ambiguous: + # previous action was bad + res = self.reward_min + elif is_done: + # really strong reward if an episode is over without game over + res = self.reward_max + else: + res = super().__call__(action, env, has_error, is_done, is_illegal, is_ambiguous) + res /= env.n_line + if not np.isfinite(res): + res = self.reward_min + return res + + # Use custom params + + # Create grid2op game environement + env_init = None + try: + from grid2op.Chronics import MultifolderWithCache + except: + from grid2op.Chronics import MultiFolder + MultifolderWithCache = MultiFolder + + game_param = Parameters() + game_param.NB_TIMESTEP_COOLDOWN_SUB = 2 + game_param.NB_TIMESTEP_COOLDOWN_LINE = 2 + env = make(args.env_name, + param=game_param, + reward_class=MyReward, + backend=backend, + chronics_class=MultifolderWithCache + ) + # env.chronics_handler.set_max_iter(7*288) + try: + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*((03)|(72)|(57))$", x) is not None) + env.chronics_handler.real_data.reset() + except RuntimeError as exc_: + raise exc_ + except AttributeError as exc_: + # not available in all grid2op version + pass + # env.chronics_handler.real_data. + env_init = env + if args.nb_env > 1: + from l2rpn_baselines.utils import make_multi_env + env = make_multi_env(env_init=env_init, nb_env=int(args.nb_env)) + + tp = TrainingParam() + + # NN training + tp.lr = 1e-4 + tp.lr_decay_steps = 30000 + tp.minibatch_size = 256 + tp.update_freq = 128 + + # limit the number of time steps played per scenarios + tp.step_increase_nb_iter = 100 # None to deactivate it + tp.min_iter = 10 + tp.update_nb_iter = 100 # once 100 scenarios are solved, increase of "step_increase_nb_iter" + + # oversampling hard scenarios + tp.oversampling_rate = 3 + + # experience replay + tp.buffer_size = 1000000 + + # e greedy + tp.min_observation = 10000 + tp.initial_epsilon = 0.4 + tp.final_epsilon = 1./(2*7*288.) 
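+    # NB (illustrative note): grid2op scenarios here use 5-minute steps, i.e. 288 steps per day,
+    # so 1./(2*7*288.) corresponds to roughly one random (exploratory) action every two weeks of play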
+ tp.step_for_final_epsilon = int(1e5) + + # don't start always at the same hour (if not None) otherwise random sampling, see docs + tp.random_sample_datetime_start = None + + # saving, logging etc. + tp.save_model_each = 10000 + tp.update_tensorboard_freq = 256 + + li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] + + # nn architecture + observation_size = SACOld_NNParam.get_obs_size(env_init, li_attr_obs_X) + sizes_q = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + sizes_v = [800, 800] # sizes of each hidden layers + sizes_pol = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + # which actions i keep + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False, + } + nm_ = args.name if args.name is not None else DEFAULT_NAME + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + finally: + env.close() + if args.nb_env > 1: + env_init.close() diff --git a/l2rpn_baselines/__init__.py b/l2rpn_baselines/__init__.py index 6a9b047..54668e5 100644 --- a/l2rpn_baselines/__init__.py +++ b/l2rpn_baselines/__init__.py @@ -7,6 +7,9 @@ "DeepQSimple", "DuelQSimple", "SAC", + "LeapNetEncoded", + # Backward compatibility + "SACOld", # contribution "PandapowerOPFAgent", "Geirina", diff --git a/l2rpn_baselines/test/test_import.py b/l2rpn_baselines/test/test_import.py index b022062..ba16d6b 100644 --- a/l2rpn_baselines/test/test_import.py +++ b/l2rpn_baselines/test/test_import.py @@ -50,6 +50,16 @@ def load_module(self): return "SAC" +class TestSACOld(TestImport, unittest.TestCase): + def load_module(self): + return "SACOld" + + +class TestLeapNetEnc(TestImport, unittest.TestCase): + def load_module(self): + return "LeapNetEncoded" + + class TestDuelQSimple(TestImport, unittest.TestCase): def load_module(self): return "DuelQSimple" diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py index 9ba77dc..b9f91a5 100644 --- a/l2rpn_baselines/test/test_train_eval.py +++ b/l2rpn_baselines/test/test_train_eval.py @@ -23,10 +23,14 @@ from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn from l2rpn_baselines.DuelQSimple import train as train_d3qs from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qs +from l2rpn_baselines.SACOld import train as train_sacold +from l2rpn_baselines.SACOld import evaluate as eval_sacold from l2rpn_baselines.SAC import train as train_sac from l2rpn_baselines.SAC import evaluate as eval_sac from l2rpn_baselines.DuelQLeapNet import train as train_leap from l2rpn_baselines.DuelQLeapNet import evaluate as eval_leap +from l2rpn_baselines.LeapNetEncoded import train as train_leapenc +from l2rpn_baselines.LeapNetEncoded import evaluate as eval_leapenc from l2rpn_baselines.DoubleDuelingDQN import train as train_d3qn from 
l2rpn_baselines.DoubleDuelingDQN import evaluate as eval_d3qn from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig as d3qn_cfg @@ -273,6 +277,62 @@ def test_train_eval(self): kwargs_archi=kwargs_archi) baseline_2 = eval_d3qs(env, + name=nm_, + load_path=tmp_dir, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=30, + verbose=False, + save_gif=False) + + +class TestSACOld(unittest.TestCase): + def test_train_eval(self): + tp = TrainingParam() + tp.buffer_size = 100 + tp.minibatch_size = 8 + tp.update_freq = 32 + tp.min_observation = 32 + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + li_attr_obs_X = ["prod_p", "load_p", "rho"] + + # neural network architecture + observation_size = NNParam.get_obs_size(env, li_attr_obs_X) + sizes_q = [100, 50, 10] # sizes of each hidden layers + sizes_v = [100, 100] # sizes of each hidden layers + sizes_pol = [100, 10] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False + } + nm_ = "AnneOnymous" + train_sacold(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_dir=tmp_dir, + training_param=tp, + verbose=False, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + + baseline_2 = eval_sacold(env, name=nm_, load_path=tmp_dir, logs_path=tmp_dir, @@ -396,6 +456,78 @@ def test_train_eval(self): save_gif=False) +class TestLeapNetEncoded(unittest.TestCase): + def test_train_eval(self): + tp = TrainingParam() + tp.buffer_size = 100 + tp.minibatch_size = 8 + tp.update_freq = 32 + tp.min_observation = 32 + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": False, + "set_topo_vect": False, + "redispacth": False + } + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env.dim_topo, + + "sizes_enc": (10, 10, 10, 10), + "sizes_main": (50, ), + "sizes_out_gm": (50,), + "sizes_Qnet": (50, 50, ) + } + nm_ = "AnneOnymous" + train_leapenc(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_dir=tmp_dir, + training_param=tp, + verbose=False, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + + baseline_2 = eval_leapenc(env, + name=nm_, + 
load_path=tmp_dir, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=30, + verbose=False, + save_gif=False) + + class TestD3QN(unittest.TestCase): def test_train_eval(self): tmp_dir = tempfile.mkdtemp() diff --git a/l2rpn_baselines/utils/BaseDeepQ.py b/l2rpn_baselines/utils/BaseDeepQ.py index a9c54d3..5a1f1cd 100644 --- a/l2rpn_baselines/utils/BaseDeepQ.py +++ b/l2rpn_baselines/utils/BaseDeepQ.py @@ -108,38 +108,60 @@ def construct_q_network(self): """ raise NotImplementedError("Not implemented") - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """ Predict movement of game controler where is epsilon probability randomly move.""" if batch_size is None: batch_size = data.shape[0] - rand_val = np.random.random(batch_size) - q_actions = self._model.predict(data, batch_size=batch_size) - - opt_policy = np.argmax(np.abs(q_actions), axis=-1) - opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) - return opt_policy, q_actions[0, opt_policy] + # q_actions = self._model.predict(data, batch_size=batch_size) # q value of each action + q_actions = self._model(data, training=training).numpy() + opt_policy = np.argmax(q_actions, axis=-1) + if epsilon > 0.: + rand_val = np.random.random(batch_size) + opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) + return opt_policy, q_actions[np.arange(batch_size), opt_policy], q_actions def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): - """Trains network to fit given parameters""" + """Trains network to fit given parameters: + Parameters + ---------- + s_batch: + the state vector (before the action is taken) + a_batch: + the action taken + s2_batch: + the state vector (after the action is taken) + d_batch: + says whether or not the episode was over + r_batch: + the reward obtained this step + + see https://towardsdatascience.com/dueling-double-deep-q-learning-using-tensorflow-2-x-7bbbcec06a2a + for the update rules + """ if batch_size is None: batch_size = s_batch.shape[0] # Save the graph just the first time if tf_writer is not None: tf.summary.trace_on() - targets = self._model.predict(s_batch, batch_size=batch_size) + target = self._model(s_batch, training=True).numpy() + fut_action = self._model(s2_batch, training=True).numpy() if tf_writer is not None: with tf_writer.as_default(): tf.summary.trace_export("model-graph", 0) tf.summary.trace_off() - fut_action = self._target_model.predict(s2_batch, batch_size=batch_size) - - targets[:, a_batch.flatten()] = r_batch - targets[d_batch, a_batch[d_batch]] += self._training_param.discount_factor * np.max(fut_action[d_batch], axis=-1) - - loss = self.train_on_batch(self._model, self._optimizer_model, s_batch, targets) + target_next = self._target_model(s2_batch, training=True).numpy() + + idx = np.arange(batch_size) + target[idx, a_batch] = r_batch + # update the value for not done episode + nd_batch = ~d_batch # update with this rule only batch that did not game over + next_a = np.argmax(fut_action, axis=-1) # compute the future action i will take in the next state + fut_Q = target_next[idx, next_a] # get its Q value + target[nd_batch, a_batch[nd_batch]] += self._training_param.discount_factor * fut_Q[nd_batch] + loss = self.train_on_batch(self._model, self._optimizer_model, s_batch, target) return loss def train_on_batch(self, model, optimizer_model, x, 
y_true): @@ -213,14 +235,22 @@ def load_network(self, path, name=None, ext="h5"): if self.verbose: print("Succesfully loaded network.") - def target_train(self): + def target_train(self, tau=None): """ update the target model with the parameters given in the :attr:`BaseDeepQ._training_param`. """ - # nothing has changed from the original implementation - model_weights = self._model.get_weights() - target_model_weights = self._target_model.get_weights() - for i in range(len(model_weights)): - target_model_weights[i] = self._training_param.tau * model_weights[i] + (1 - self._training_param.tau) * \ - target_model_weights[i] - self._target_model.set_weights(target_model_weights) \ No newline at end of file + if tau is None: + tau = self._training_param.tau + tau_inv = 1.0 - tau + + target_params = self._target_model.trainable_variables + source_params = self._model.trainable_variables + for src, dest in zip(source_params, target_params): + # Polyak averaging + var_update = src.value() * tau + var_persist = dest.value() * tau_inv + dest.assign(var_update + var_persist) + + def save_tensorboard(self, current_step): + """function used to save other information to tensorboard""" + pass diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index 18637a4..2bcfb25 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -135,9 +135,6 @@ def __init__(self, self.store_action = store_action self.dict_action = {} self.istraining = istraining - self._actions_per_1000steps = np.zeros((1000, self.action_space.size()), dtype=np.int) - self._illegal_actions_per_1000steps = np.zeros(1000, dtype=np.int) - self._ambiguous_actions_per_1000steps = np.zeros(1000, dtype=np.int) self.epsilon = 1.0 # for tensorbaord @@ -166,9 +163,6 @@ def __init__(self, # this is for the "limit the episode length" depending on your previous success self._total_sucesses = 0 - # update frequency of action types - self._nb_updated_act_tensorboard = None - # neural network architecture self._nn_archi = nn_archi @@ -182,6 +176,23 @@ def __init__(self, else: self.init_obs_extraction(observation_space) + # for the frequency of action type + self.current_ = 0 + self.nb_ = 10 + self._nb_this_time = np.zeros((self.nb_, 6)) + + # + self._vector_size = None + self._actions_per_ksteps = None + self._illegal_actions_per_ksteps = None + self._ambiguous_actions_per_ksteps = None + + def _fill_vectors(self, training_param): + self._vector_size = self.nb_ * training_param.update_tensorboard_freq + self._actions_per_ksteps = np.zeros((self._vector_size, self.action_space.size()), dtype=np.int) + self._illegal_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int) + self._ambiguous_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int) + # grid2op.Agent interface def convert_obs(self, observation): """ @@ -232,7 +243,9 @@ def my_act(self, transformed_observation, reward, done=False): The id the action taken. 
""" - predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation, epsilon=0.0) + predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation, + epsilon=0.0, + training=False) res = int(predict_movement_int) self._store_action_played(res) return res @@ -389,6 +402,7 @@ def train(self, else: training_param = self._training_param self._init_deep_q(self._training_param, env) + self._fill_vectors(self._training_param) self._init_replay_buffer() @@ -410,7 +424,6 @@ def train(self, UPDATE_FREQ = training_param.update_tensorboard_freq # update tensorboard every "UPDATE_FREQ" steps SAVING_NUM = training_param.save_model_each - if hasattr(env, "nb_env"): nb_env = env.nb_env warnings.warn("Training using {} environments".format(nb_env)) @@ -478,8 +491,6 @@ def train(self, self._prev_id = 0 # this is for the "limit the episode length" depending on your previous success self._total_sucesses = 0 - # update the frequency of action types - self._nb_updated_act_tensorboard = 0 with tqdm(total=iterations - training_step, disable=not self.verbose) as pbar: while training_step < iterations: @@ -491,7 +502,7 @@ def train(self, self.epsilon = self._training_param.get_next_epsilon(current_step=training_step) # then we need to predict the next moves. Agents have been adapted to predict a batch of data - pm_i, pq_v, act = self._next_move(initial_state, self.epsilon) + pm_i, pq_v, act = self._next_move(initial_state, self.epsilon, training_step) # todo store the illegal / ambiguous / ... actions reward, done = self._init_local_train_loop() @@ -501,7 +512,6 @@ def train(self, act = act[0] temp_observation_obj, temp_reward, temp_done, info = env.step(act) - if self.__nb_env == 1: # dirty hack to wrap them into list temp_observation_obj = [temp_observation_obj] @@ -530,7 +540,6 @@ def train(self, alive_frames[epoch_num] = np.mean(alive_frame) total_rewards[epoch_num] = np.mean(total_reward) self._store_action_played_train(training_step, pm_i) - self._save_tensorboard(training_step, epoch_num, UPDATE_FREQ, total_rewards, alive_frames) training_step += 1 pbar.update(1) @@ -549,36 +558,40 @@ def _convert_obs_train(self, observations): self._obs_as_vect[i, :] = self.convert_obs(obs).reshape(-1) return self._obs_as_vect + def _create_action_if_not_registered(self, action_int): + """make sure that `action_int` is present in dict_action""" + if action_int not in self.dict_action: + act = self.action_space.all_actions[action_int] + is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn = False, False, False, False, False, False + try: + # feature unavailble in grid2op <= 0.9.2 + is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types() + is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp) + except Exception as exc_: + pass + + self.dict_action[action_int] = [0, act, + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn)] + def _store_action_played(self, action_int): """if activated, this function will store the action taken by the agent.""" if self.store_action: - if action_int not in self.dict_action: - act = self.action_space.all_actions[action_int] - is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn = False, False, False, False, False, False - try: - # feature unavailble in grid2op <= 0.9.2 - is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types() - is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp) - except Exception as exc_: - pass - - 
self.dict_action[action_int] = [0, act, - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn)] - self.dict_action[action_int][0] += 1 - - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn) = self.dict_action[action_int][2] - if is_inj: - self.nb_injection += 1 - if is_volt: - self.nb_voltage += 1 - if is_topo: - self.nb_topology += 1 - if is_line_status: - self.nb_line += 1 - if is_redisp: - self.nb_redispatching += 1 - if is_dn: - self.nb_do_nothing += 1 + self._create_action_if_not_registered(action_int) + + self.dict_action[action_int][0] += 1 + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn) = self.dict_action[action_int][2] + if is_inj: + self.nb_injection += 1 + if is_volt: + self.nb_voltage += 1 + if is_topo: + self.nb_topology += 1 + if is_line_status: + self.nb_line += 1 + if is_redisp: + self.nb_redispatching += 1 + if is_dn: + self.nb_do_nothing += 1 def _convert_all_act(self, act_as_integer): """this function converts the action given as a list of integer. It ouputs a list of valid grid2op Action""" @@ -611,6 +624,7 @@ def _train_model(self, training_step): self._training_param.tell_step(training_step) if training_step > max(self._training_param.min_observation, self._training_param.minibatch_size) and \ self._training_param.do_train(): + # train the model s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample(self._training_param.minibatch_size) tf_writer = None @@ -630,14 +644,15 @@ def _train_model(self, training_step): def _updage_illegal_ambiguous(self, curr_step, info): """update the conunt of illegal and ambiguous actions""" - self._illegal_actions_per_1000steps[curr_step % 1000] = np.sum([el["is_illegal"] for el in info]) - self._ambiguous_actions_per_1000steps[curr_step % 1000] = np.sum([el["is_ambiguous"] for el in info]) + tmp_ = curr_step % self._vector_size + self._illegal_actions_per_ksteps[tmp_] = np.sum([el["is_illegal"] for el in info]) + self._ambiguous_actions_per_ksteps[tmp_] = np.sum([el["is_ambiguous"] for el in info]) def _store_action_played_train(self, training_step, action_id): """store which action were played, for tensorboard only.""" - which_row = training_step % 1000 - self._actions_per_1000steps[which_row, :] = 0 - self._actions_per_1000steps[which_row, action_id] += 1 + which_row = training_step % self._vector_size + self._actions_per_ksteps[which_row, :] = 0 + self._actions_per_ksteps[which_row, action_id] += 1 def _fast_forward_env(self, env, time=7*24*60/5): """use this functio to skip some time steps when environment is reset.""" @@ -674,7 +689,8 @@ def _reset_env_clean_state(self, env): def _need_reset(self, env, observation_num, epoch_num, done, new_state): """perform the proper reset of the environment""" - if self._training_param.step_increase_nb_iter > 0: + if self._training_param.step_increase_nb_iter is not None and \ + self._training_param.step_increase_nb_iter > 0: self._max_iter_env(min(max(self._training_param.min_iter, self._training_param.max_iter_fun(self._total_sucesses)), self._training_param.max_iter)) # TODO @@ -748,19 +764,27 @@ def _init_replay_buffer(self): def _store_new_state(self, initial_state, predict_movement_int, reward, done, new_state): """store the new state in the replay buffer""" # vectorized version of the previous code - for i_s, pm_i, reward, done, new_state in zip(initial_state, predict_movement_int, reward, done, new_state): + for i_s, pm_i, reward, done, ns in zip(initial_state, predict_movement_int, reward, done, new_state): self.replay_buffer.add(i_s, 
                                    pm_i, reward, done,
-                                   new_state)
+                                   ns)
 
     def _max_iter_env(self, new_max_iter):
         """update the number of maximum iteration allowed."""
         self._max_iter_env_ = new_max_iter
 
-    def _next_move(self, curr_state, epsilon):
-        pm_i, pq_v = self.deep_q.predict_movement(curr_state, epsilon)
+    def _next_move(self, curr_state, epsilon, training_step):
+        # supposes that 0 encodes for do nothing, otherwise it will NOT work (for the observer)
+        pm_i, pq_v, q_actions = self.deep_q.predict_movement(curr_state, epsilon, training=True)
+
+        if self._training_param.min_observe is not None and \
+                training_step < self._training_param.min_observe:
+            # action is replaced by do nothing due to the "observe only" specification
+            pm_i[:] = 0
+            pq_v[:] = q_actions[:, 0]
+        # TODO implement the "max XXX random action per scenarios"
         act = self._convert_all_act(pm_i)
         return pm_i, pq_v, act
@@ -839,12 +863,12 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
             mean_reward_100 = mean_reward
             mean_alive_100 = mean_alive
 
-            tmp = self._actions_per_1000steps > 0
+            tmp = self._actions_per_ksteps > 0
             tmp = tmp.sum(axis=0)
-            nb_action_taken_last_1000_step = np.sum(tmp > 0)
+            nb_action_taken_last_kstep = np.sum(tmp > 0)
 
-            nb_illegal_act = np.sum(self._illegal_actions_per_1000steps)
-            nb_ambiguous_act = np.sum(self._ambiguous_actions_per_1000steps)
+            nb_illegal_act = np.sum(self._illegal_actions_per_ksteps)
+            nb_ambiguous_act = np.sum(self._ambiguous_actions_per_ksteps)
 
             if epoch_num >= 100:
                 mean_reward_100 = np.nanmean(epoch_rewards[(epoch_num-100):epoch_num])
@@ -861,70 +885,119 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
                 # show first the Mean reward and mine time alive (hence the upper case)
                 tf.summary.scalar("Mean_alive_30", mean_alive_30, step_tb,
-                                  description="Mean reward over the last 30 epochs")
+                                  description="Average number of steps (per episode) made over the last 30 "
+                                              "completed episodes")
                 tf.summary.scalar("Mean_reward_30", mean_reward_30, step_tb,
-                                  description="Mean number of timesteps sucessfully manage the last 30 epochs")
+                                  description="Average (final) reward obtained over the last 30 completed episodes")
 
                 # then it's alpha numerical order, hence the "z_" in front of some information
                 tf.summary.scalar("loss", self._losses[step], step_tb,
-                                  description="last training loss")
+                                  description="Training loss (for the last training batch)")
                 tf.summary.scalar("last_alive", last_alive, step_tb,
-                                  description="last number of timestep during which the agent stayed alive")
+                                  description="Final number of steps for the last complete episode")
                 tf.summary.scalar("last_reward", last_reward, step_tb,
-                                  description="last reward get by the agent")
+                                  description="Final reward over the last complete episode")
 
-                tf.summary.scalar("mean_reward", mean_reward, step_tb)
-                tf.summary.scalar("mean_alive", mean_alive, step_tb)
+                tf.summary.scalar("mean_reward", mean_reward, step_tb,
+                                  description="Average reward over all the episodes played")
+                tf.summary.scalar("mean_alive", mean_alive, step_tb,
+                                  description="Average time alive over all the episodes played")
 
                 tf.summary.scalar("mean_reward_100", mean_reward_100, step_tb,
-                                  description="Mean reward over the last 100 epochs")
+                                  description="Average (final) reward obtained over the last 100 "
+                                              "completed episodes")
                 tf.summary.scalar("mean_alive_100", mean_alive_100, step_tb,
-                                  description="Mean number of timesteps sucessfully manage the last 100 epochs")
+                                  description="Average number of steps (per episode) made over the last 
100 completed episodes") - tf.summary.scalar("nb_differentaction_taken_1000", nb_action_taken_last_1000_step, step_tb, - description="Number of different actions played the past 1000 steps") + tf.summary.scalar("nb_different_action_taken", nb_action_taken_last_kstep, step_tb, + description="Number of different actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_illegal_act", nb_illegal_act, step_tb, - description="Number of illegal actions played the past 1000 steps") + description="Number of illegal actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_ambiguous_act", nb_ambiguous_act, step_tb, - description="Number of ambiguous actions played the past 1000 steps") + description="Number of ambiguous actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_total_success", self._total_sucesses, step_tb, - description="Number of times I reach the end of scenario (no game over)") + description="Number of times the episode was completed entirely " + "(no game over)") tf.summary.scalar("z_lr", self._train_lr, step_tb, - description="current learning rate") + description="Current learning rate") tf.summary.scalar("z_epsilon", self.epsilon, step_tb, - description="current epsilon (of the epsilon greedy)") + description="Current epsilon (from the epsilon greedy)") tf.summary.scalar("z_max_iter", self._max_iter_env_, step_tb, - description="maximum number of time steps before deciding a scenario is over (=win)") + description="Maximum number of time steps before deciding a scenario " + "is over (=win)") tf.summary.scalar("z_total_episode", epoch_num, step_tb, - description="total number of episode played (~number of \"reset\")") + description="Total number of episode played (number of \"reset\")") + + self.deep_q.save_tensorboard(step_tb) if self.store_action: - nb_ = 10 # reset the frequencies every nb_ saving - self._nb_updated_act_tensorboard += UPDATE_FREQ - tf.summary.scalar("zz_freq_inj", self.nb_injection / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("zz_freq_voltage", self.nb_voltage / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_topo", self.nb_topology / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_line_status", self.nb_line / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_redisp", self.nb_redispatching / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_do_nothing", self.nb_do_nothing / self._nb_updated_act_tensorboard, step_tb) - if step % (nb_ * UPDATE_FREQ) == 0: - self.nb_injection = 0 - self.nb_voltage = 0 - self.nb_topology = 0 - self.nb_line = 0 - self.nb_redispatching = 0 - self.nb_do_nothing = 0 - self._nb_updated_act_tensorboard = 0 + self._store_frequency_action_type(UPDATE_FREQ, step_tb) if self._time_step_lived is not None: tf.summary.histogram( "timestep_lived", self._time_step_lived, step=step_tb, buckets=None, - description="number of time steps lived for all scenarios" + description="Number of time steps lived for all scenarios" ) if self._nb_chosen is not None: tf.summary.histogram( "nb_chosen", self._nb_chosen, step=step_tb, buckets=None, - description="number of times this scenarios has been played" + description="Number of times this scenarios has been played" ) + + def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): + self.current_ += 1 + self.current_ %= self.nb_ + nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_dn = 
+    def _store_frequency_action_type(self, UPDATE_FREQ, step_tb):
+        self.current_ += 1
+        self.current_ %= self.nb_
+        nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_dn = self._nb_this_time[self.current_, :]
+        self._nb_this_time[self.current_, :] = [self.nb_injection, self.nb_voltage,
+                                                self.nb_topology, self.nb_line,
+                                                self.nb_redispatching, self.nb_do_nothing]
+
+        curr_inj = self.nb_injection - nb_inj
+        curr_volt = self.nb_voltage - nb_volt
+        curr_topo = self.nb_topology - nb_topo
+        curr_line = self.nb_line - nb_line
+        curr_redisp = self.nb_redispatching - nb_redisp
+        curr_dn = self.nb_do_nothing - nb_dn
+
+        total_act_num = curr_inj + curr_volt + curr_topo + curr_line + curr_redisp + curr_dn
+        tf.summary.scalar("zz_freq_inj",
+                          curr_inj / total_act_num,
+                          step_tb,
+                          description="Frequency of \"injection\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("zz_freq_voltage",
+                          curr_volt / total_act_num,
+                          step_tb,
+                          description="Frequency of \"voltage\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_topo",
+                          curr_topo / total_act_num,
+                          step_tb,
+                          description="Frequency of \"topo\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_line_status",
+                          curr_line / total_act_num,
+                          step_tb,
+                          description="Frequency of \"line status\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_redisp",
+                          curr_redisp / total_act_num,
+                          step_tb,
+                          description="Frequency of \"redispatching\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_do_nothing",
+                          curr_dn / total_act_num,
+                          step_tb,
+                          description="Frequency of \"do nothing\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
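The ``_store_frequency_action_type`` helper above computes the action-type frequencies over a sliding window instead of periodically resetting the counters: a ring buffer keeps ``nb_`` snapshots of the cumulative per-type counters, and subtracting the snapshot taken ``nb_`` calls earlier yields the counts for the last ``nb_ * UPDATE_FREQ`` steps. A self-contained sketch of that idea (the names below are illustrative, not the agent's actual attributes):

.. code-block:: python

    import numpy as np

    class RollingActionCounter:
        """Sliding-window counts obtained by differencing cumulative counters."""
        def __init__(self, nb_=10, n_types=6):
            self.nb_ = nb_              # number of snapshots kept
            self.current_ = 0           # position in the ring buffer
            self._snapshots = np.zeros((nb_, n_types))

        def update(self, cumulative_counts):
            # advance the ring buffer, read the snapshot written nb_ calls ago,
            # then overwrite that slot with the current cumulative totals
            self.current_ = (self.current_ + 1) % self.nb_
            previous = self._snapshots[self.current_, :].copy()
            self._snapshots[self.current_, :] = cumulative_counts
            # the difference is the per-type count over the last window
            return np.asarray(cumulative_counts) - previous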
diff --git a/l2rpn_baselines/utils/NNParam.py b/l2rpn_baselines/utils/NNParam.py
index f6df6c1..8294d9a 100644
--- a/l2rpn_baselines/utils/NNParam.py
+++ b/l2rpn_baselines/utils/NNParam.py
@@ -7,6 +7,10 @@
 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
 
 import os
 import json
+import numpy as np
+from collections.abc import Iterable
+
+import grid2op
 
 from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ
@@ -115,13 +119,38 @@ def to_dict(self):
         for attr_nm in self._list_float:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [float(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, float)
         for attr_nm in self._list_int:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [int(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, int)
         for attr_nm in self._list_str:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [str(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, str)
+        return res
+
+    @classmethod
+    def _convert_list_to_json(cls, obj, type_):
+        if isinstance(obj, type_):
+            res = obj
+        elif isinstance(obj, np.ndarray):
+            if len(obj.shape) == 1:
+                res = [type_(el) for el in obj]
+            else:
+                res = [cls._convert_list_to_json(el, type_) for el in obj]
+        elif isinstance(obj, Iterable):
+            res = [cls._convert_list_to_json(el, type_) for el in obj]
+        else:
+            res = type_(obj)
+        return res
+
+    @classmethod
+    def _attr_from_json(cls, json, type_):
+        if isinstance(json, type_):
+            res = json
+        elif isinstance(json, list):
+            res = [cls._convert_list_to_json(obj=el, type_=type_) for el in json]
+        else:
+            res = type_(json)
         return res
 
     @classmethod
@@ -155,13 +184,13 @@ def from_dict(cls, tmp):
         for attr_nm in cls._list_float:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [float(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], float)
         for attr_nm in cls._list_int:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [int(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], int)
         for attr_nm in cls._list_str:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [str(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], str)
         res = cls(**cls_as_dict)
         return res
@@ -189,3 +218,60 @@ def save_as_json(self, path, name=None):
         path_out = os.path.join(path, name)
         with open(path_out, "w", encoding="utf-8") as f:
             json.dump(res, fp=f, indent=4, sort_keys=True)
+
+    def center_reduce(self, env):
+        """currently not implemented for this class, "coming soon" as we might say"""
+        # TODO see TestLeapNet for this feature
+        self._center_reduce_vect(env.get_obs(), "x")
+
+    def _center_reduce_vect(self, obs, nn_part):
+        """
+        compute the xxxx_adds and xxxx_mults for one part of the neural network called nn_part,
+        depending on what attribute of the observation is extracted
+        """
+        if not isinstance(obs, grid2op.Observation.BaseObservation):
+            # in multi processing i receive a set of observation there so i might need
+            # to extract only the first one
+            obs = obs[0]
+
+        li_attr_obs = getattr(self, "list_attr_obs_{}".format(nn_part))
+        adds = []
+        mults = []
+        for attr_nm in li_attr_obs:
+            if attr_nm in ["prod_p"]:
+                add_tmp = np.array([-0.5*(pmax + pmin) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+                mult_tmp = np.array([1./max((pmax - pmin), 0.) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+            elif attr_nm in ["prod_q"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1./max(abs(val), 1.0) for val in obs.prod_q])
+            elif attr_nm in ["load_p", "load_q"]:
+                add_tmp = np.array([-val for val in getattr(obs, attr_nm)])
+                mult_tmp = 0.5
+            elif attr_nm in ["load_v", "prod_v", "v_or", "v_ex"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1. / val for val in getattr(obs, attr_nm)])
+            elif attr_nm == "hour_of_day":
+                add_tmp = -12.
+                mult_tmp = 1.0/12
+            elif attr_nm == "minute_of_hour":
+                add_tmp = -30.
+                mult_tmp = 1.0/30
+            elif attr_nm == "day_of_week":
+                add_tmp = -4.
+                mult_tmp = 1.0/4
+            elif attr_nm == "day":
+                add_tmp = -15.
+                mult_tmp = 1.0/15.
+            elif attr_nm in ["target_dispatch", "actual_dispatch"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1./(pmax - pmin) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+            elif attr_nm in ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1.0 / max(val, 1.0) for val in getattr(obs, attr_nm)])
+            else:
+                add_tmp = 0.
+                mult_tmp = 1.0
+            mults.append(mult_tmp)
+            adds.append(add_tmp)
+        setattr(self, "{}_adds".format(nn_part), adds)
+        setattr(self, "{}_mults".format(nn_part), mults)
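With the ``_convert_list_to_json`` / ``_attr_from_json`` helpers above, ``NNParam`` attributes that hold numpy arrays (including 2-d ones) can be written by ``save_as_json`` and read back by ``from_dict``: arrays are recursively converted to plain Python lists before ``json.dump`` is called. A small sketch of the behaviour, assuming ``NNParam`` can be imported from ``l2rpn_baselines.utils``:

.. code-block:: python

    import json
    import numpy as np
    from l2rpn_baselines.utils import NNParam

    raw = np.array([[1., 2.], [3., 4.]])                    # a 2-d array is not json serializable as is
    as_plain = NNParam._convert_list_to_json(raw, float)    # -> [[1.0, 2.0], [3.0, 4.0]]
    json.dumps(as_plain)                                    # plain nested lists: json.dump now works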
diff --git a/l2rpn_baselines/utils/ReplayBuffer.py b/l2rpn_baselines/utils/ReplayBuffer.py
index 68164b4..f88864a 100644
--- a/l2rpn_baselines/utils/ReplayBuffer.py
+++ b/l2rpn_baselines/utils/ReplayBuffer.py
@@ -12,6 +12,7 @@
 from collections import deque
 import numpy as np
 import random
+import copy
 
 import pdb
 
@@ -34,6 +35,7 @@ def add(self, s, a, r, d, s2):
             raise RuntimeError("Infinite value somwhere in at least one of the state")
 
         experience = (s, a, r, d, s2)
+        experience = copy.deepcopy(experience)
         if self.count < self.buffer_size:
             self.buffer.append(experience)
             self.count += 1
diff --git a/l2rpn_baselines/utils/TrainingParam.py b/l2rpn_baselines/utils/TrainingParam.py
index 1689dec..551ebea 100644
--- a/l2rpn_baselines/utils/TrainingParam.py
+++ b/l2rpn_baselines/utils/TrainingParam.py
@@ -114,11 +114,11 @@ class TrainingParam(object):
     _tol_float_equal = float(1e-8)
 
     _int_attr = ["buffer_size", "minibatch_size", "step_for_final_epsilon",
-                 "min_observation", "last_step", "num_frames", "update_freq",
+                 "min_observation", "last_step", "num_frames", "update_freq", "min_iter",
                  "max_iter", "update_tensorboard_freq", "save_model_each", "_update_nb_iter",
-                 "step_increase_nb_iter"]
+                 "step_increase_nb_iter", "min_observe"]
     _float_attr = ["_final_epsilon", "_initial_epsilon", "lr", "lr_decay_steps", "lr_decay_rate",
-                   "discount_factor", "tau", "oversampling_rate",
+                   "discount_factor", "tau", "oversampling_rate", "max_global_norm_grad",
                    "max_value_grad", "max_loss"]
 
     def __init__(self,
@@ -145,7 +145,11 @@
                  oversampling_rate=None,
                  max_global_norm_grad=None,
                  max_value_grad=None,
-                 max_loss=None
+                 max_loss=None,
+
+                 # observer: let the neural network "observe" for a given amount of time
+                 # all actions are replaced by a do nothing
+                 min_observe=None,
                  ):
         self.random_sample_datetime_start = random_sample_datetime_start
@@ -165,6 +169,9 @@
         self.max_value_grad = max_value_grad
         self.max_loss = max_loss
+
+        # observer
+        self.min_observe = min_observe
+
         self.last_step = int(0)
         self.num_frames = int(num_frames)
         self.discount_factor = float(discount_factor)
@@ -267,9 +274,9 @@ def to_dict(self):
 
     @staticmethod
     def from_dict(tmp):
-        """initialize this instance from a dictionnary"""
+        """initialize this instance from a dictionary"""
         if not isinstance(tmp, dict):
-            raise RuntimeError("TrainingParam from dict must be called with a dictionnary, and not {}".format(tmp))
+            raise RuntimeError("TrainingParam from dict must be called with a dictionary, and not {}".format(tmp))
         res = TrainingParam()
         for attr_nm in TrainingParam._int_attr:
             if attr_nm in tmp: