diff --git a/.gitignore b/.gitignore
index 579d04d..89d4206 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,11 @@ l2rpn_baselines/DeepQSimple/saved_baseline/
 l2rpn_baselines/DuelQLeapNet/logs-eval/
 l2rpn_baselines/DuelQSimple/saved_baseline/
 l2rpn_baselines/SAC/saved_baseline/
-
+l2rpn_baselines/TestLeapNet/model_saved/
+l2rpn_baselines/TestLeapNet/tf_logs/
+l2rpn_baselines/TestLeapNet/logs-eval/
+l2rpn_baselines/LeapNetEncoded/logs-eval/
+l2rpn_baselines/LeapNetEncoded/model_saved/
+l2rpn_baselines/LeapNetEncoded/tf_logs/
+l2rpn_baselines/LeapNetEncoded/tf_logs_test/
+l2rpn_baselines/LeapNetEncoded/model_test/
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 33234a2..32cf9de 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,6 +4,18 @@ Change Log
 --------
 - stack multiple states in `utils/DeepQAgent`
 
+[0.5.0] - 2020-08-??
+--------------------
+- [FIXED] the counting of the action type frequencies in tensorboard (for some baselines)
+- [FIXED] a broken replay buffer `utils.ReplayBuffer` (used in some baselines)
+- [FIXED] a bug when using multiple environments for some baselines
+- [FIXED] a wrong Q-value update for some baselines
+- [IMPROVED] descriptions and computation of the tensorboard information (for some baselines)
+- [IMPROVED] performance optimization for training and usage of some baselines
+- [ADDED] better serialization as json of the `utils.NNParam` class
+- [ADDED] the LeapNetEncoded baseline, which uses a leap neural network (leap net) to create an
+  embedding of the state of the powergrid.
+
 [0.4.4] - 2020-07-07
 --------------------
 - [FIXED] now the baselines can fully support the grid2op MultiMix environment.
diff --git a/docs/LeapNetEncoded.rst b/docs/LeapNetEncoded.rst
new file mode 100644
index 0000000..92a3674
--- /dev/null
+++ b/docs/LeapNetEncoded.rst
@@ -0,0 +1,50 @@
+LeapNetEncoded: D3QN on a state encoded by a leap net
+======================================================
+
+TODO reference the original papers `ESANN Paper `_
+`Leap Net `_
+
+It has now been implemented as a github repository `Leap Net Github `_
+
+Description
+-----------
+The leap net is a type of neural network that has shown really good performance for predicting flows on
+powerlines based on the injections and the topology.
+
+In this baseline, we use this very same architecture to encode the powergrid state (at a given
+step).
+
+This embedding of the powergrid is then used by a neural network (that can be a regular network or
+a leap net) that parametrizes the Q function.
+
+An example to train this model is available in the train function :ref:`Example-leapnetenc`.
+
+Exported class
+--------------
+You can use this class with:
+
+.. code-block:: python
+
+    from l2rpn_baselines.LeapNetEncoded import train, evaluate, LeapNetEncoded
+
+.. automodule:: l2rpn_baselines.LeapNetEncoded
+    :members:
+    :autosummary:
+
+Other non-exported classes
+--------------------------
+These classes are not exported by default; you can import them with (non exhaustive list):
+
+.. code-block:: python
+
+    from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN
+    from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam
+
+
+.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN.LeapNetEncoded_NN
+    :members:
+    :autosummary:
+
+.. autoclass:: l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam
+    :members:
+    :autosummary:
diff --git a/docs/SAC.rst b/docs/SAC.rst
index 772368e..688dc0f 100644
--- a/docs/SAC.rst
+++ b/docs/SAC.rst
@@ -4,6 +4,9 @@ SAC: Soft Actor Critic
 This baseline comes from the paper:
 `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
 
+**NB** This version is a new implementation of the SAC baseline. We recommend using it for new
+projects. The old version had some issues; for backward compatibility it is still
+available under the name "SACOld".
 
 Description
 -----------
diff --git a/docs/SACOld.rst b/docs/SACOld.rst
new file mode 100644
index 0000000..3cf6237
--- /dev/null
+++ b/docs/SACOld.rst
@@ -0,0 +1,45 @@
+SACOld: Soft Actor Critic (deprecated)
+=======================================
+
+This baseline comes from the paper:
+`Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
+
+
+Description
+-----------
+This module proposes an implementation of the SAC algorithm.
+
+**This is an old implementation that is probably not correct; it is kept only for backward
+compatibility with earlier versions (< 0.5.0) of this package.**
+
+An example to train this model is available in the train function :ref:`Example-sacold`.
+
+Exported class
+--------------
+You can use this class with:
+
+.. code-block:: python
+
+    from l2rpn_baselines.SACOld import train, evaluate, SACOld
+
+.. automodule:: l2rpn_baselines.SACOld
+    :members:
+    :autosummary:
+
+Other non-exported classes
+--------------------------
+These classes are not exported by default; you can import them with (non exhaustive list):
+
+.. code-block:: python
+
+    from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+    from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+
+
+.. autoclass:: l2rpn_baselines.SACOld.SACOld_NN.SACOld_NN
+    :members:
+    :autosummary:
+
+.. autoclass:: l2rpn_baselines.SACOld.SACOld_NNParam.SACOld_NNParam
+    :members:
+    :autosummary:
diff --git a/docs/index.rst b/docs/index.rst
index 6492796..cc7641c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -39,6 +39,16 @@ More advanced baselines
     DuelQLeapNet
     DoubleDuelingRDQN
+    LeapNetEncoded
+
+
+Deprecated baselines
+---------------------------
+
+..
toctree:: + :maxdepth: 2 + + SACOld Contributions diff --git a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py index 96c91c8..caf77d8 100644 --- a/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py +++ b/l2rpn_baselines/DuelQLeapNet/DuelQLeapNet_NN.py @@ -165,13 +165,13 @@ def _make_x_tau(self, data): res = [data_x, *data_tau] return res - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """Predict movement of game controler where is epsilon probability randomly move.""" if batch_size is None: batch_size = data.shape[0] data_split = self._make_x_tau(data) - res = super().predict_movement(data_split, epsilon=epsilon, batch_size=batch_size) + res = super().predict_movement(data_split, epsilon=epsilon, batch_size=batch_size, training=training) return res def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py new file mode 100644 index 0000000..6c43364 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + + +from l2rpn_baselines.utils import DeepQAgent + +DEFAULT_NAME = "LeapNetEncoded" + + +class LeapNetEncoded(DeepQAgent): + """ + Inheriting from :class:`l2rpn_baselines.DeepQAgent` this class implements the particular agent used for the + Double Duelling Deep Q network baseline, with the particularity that the Q network is encoded with a leap net. + + It does nothing in particular. + """ + pass diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py new file mode 100644 index 0000000..e30a0bc --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NN.py @@ -0,0 +1,352 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import numpy as np +import os + +# tf2.0 friendly +import warnings + +import tensorflow as tf +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + from tensorflow.keras.models import Sequential, Model + from tensorflow.keras.layers import Activation + from tensorflow.keras.layers import Input, Lambda, subtract, add + import tensorflow.keras.backend as K + +from l2rpn_baselines.utils import BaseDeepQ, TrainingParam + + +# TODO implement that in the leap net package too +from tensorflow.keras.layers import Dense + + +from l2rpn_baselines.DuelQLeapNet.DuelQLeapNet_NN import LtauBis + + +class LeapNetEncoded_NN(BaseDeepQ): + """ + Constructs the desired neural networks. 
+ + More information on the leap net can be found at `Leap Net on Github `_ + + These are: + + - a "state encoder" that uses a leap net to "encode" the observation, or at least the part + related to powergrid + - a q network, that uses the output of the state encoder to predict which action is best. + + The Q network can have other types of input, and can also be a leap net, see the class + :class:`l2rpn_baselines.LeapNetEncoded_NNParam.LeapNetEncoded_NNParam` for more information + + """ + def __init__(self, + nn_params, + training_param=None): + if training_param is None: + training_param = TrainingParam() + BaseDeepQ.__init__(self, + nn_params, + training_param) + self._custom_objects = {"LtauBis": LtauBis} + self._max_global_norm_grad = training_param.max_global_norm_grad + self._max_value_grad = training_param.max_value_grad + self._max_loss = training_param.max_loss + + self.train_lr = 1.0 + + # added + self.encoded_state = None + self.grid_model = None + self._schedule_grid_model = None + self._optimizer_grid_model = None + self._qnet_variables = [] + self.grid_model_losses_npy = None + + self.construct_q_network() + + def construct_q_network(self): + """ + First the :attr:`l2rpn_baselines.BaseDeepQ.nn_archi` parameters are used to create a neural network + to 'encode' the data. Then the leaps occur. + + Afterward the model is split into value an advantage, and treated as usually in any D3QN. + + """ + # Uses the network architecture found in DeepMind paper + # The inputs and outputs size have changed, as well as replacing the convolution by dense layers. + self._model = Sequential() + inputs_x = [Input(shape=(el,), name="x_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.x_dims, self._nn_archi.list_attr_obs_x)] + inputs_q = [Input(shape=(el,), name="input_q_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.input_q_dims, self._nn_archi.list_attr_obs_input_q)] + inputs_tau = [Input(shape=(el,), name="tau_{}".format(nm_)) for el, nm_ in + zip(self._nn_archi.tau_dims, self._nn_archi.list_attr_obs_tau)] + input_topo = Input(shape=(2*self._nn_archi.dim_topo,), name="topo") + models_all_inputs = [*inputs_x, *inputs_q, *inputs_tau, input_topo] + + # encode each data type in initial layers + encs_out = [] + for init_val, nm_ in zip(inputs_x, self._nn_archi.list_attr_obs_x): + lay = init_val + for i, size in enumerate(self._nn_archi.sizes_enc): + lay = Dense(size, name="enc_{}_{}".format(nm_, i))(lay) # TODO resnet instead of Dense + lay = Activation("relu")(lay) + encs_out.append(lay) + + # concatenate all that + lay = tf.keras.layers.concatenate(encs_out) + # now "lay" is the encoded observation + + # i do a few layer + for i, size in enumerate(self._nn_archi.sizes_main): + lay = Dense(size, name="main_{}".format(i))(lay) # TODO resnet instead of Dense + lay = Activation("relu")(lay) + + # now i do the leap net to encode the state + encoded_state = tf.keras.layers.add([lay, LtauBis(name="leap_topo")([lay, input_topo])], + name="encoded_state") + self.encoded_state = tf.keras.backend.stop_gradient(encoded_state) + + # i predict the full state of the grid given the "control" variables + outputs_gm = [] + grid_model_losses = {} + lossWeights = {} # TODO + for sz_out, nm_ in zip(self._nn_archi.gm_out_dims, + self._nn_archi.list_attr_obs_gm_out): + lay = encoded_state # carefull i need my gradients here ! 
(don't use self.encoded_state) + for i, size in enumerate(self._nn_archi.sizes_out_gm): + lay = Dense(size, name="{}_{}".format(nm_, i))(lay) + lay = Activation("relu")(lay) + + # predict now the variable + name_output = "{}_hat".format(nm_) + pred_ = Dense(sz_out, name=name_output)(lay) + outputs_gm.append(pred_) + grid_model_losses[name_output] = "mse" + + # NB grid_model does not use inputs_tau + self.grid_model = Model(inputs=models_all_inputs, outputs=outputs_gm, name="grid_model") + self._schedule_grid_model, self._optimizer_grid_model = self.make_optimiser() + self.grid_model.compile(loss=grid_model_losses, optimizer=self._optimizer_grid_model) # , loss_weights=lossWeights + + # And now let's predict the Q values of each actions given the encoded grid state + input_Qnet = inputs_q + [self.encoded_state] + # TODO do i pre process the data coming from inputs_q ??? + + lay = tf.keras.layers.concatenate(input_Qnet, name="input_Q_network") + for i, size in enumerate(self._nn_archi.sizes_Qnet): + tmp = Dense(size, name="qvalue_{}".format(i)) # TODO resnet instead of Dense + lay = tmp(lay) + lay = Activation("relu")(lay) + self._qnet_variables += tmp.trainable_weights + + # And i predict the Q value of the action + l_tau = lay + for el, nm_ in zip(inputs_tau, self._nn_archi.list_attr_obs_tau): + tmp = LtauBis(name="leap_{}".format(nm_)) + l_tau = l_tau + tmp([lay, el]) + self._qnet_variables += tmp.trainable_weights + + tmp = Dense(self._action_size) + advantage = tmp(l_tau) + self._qnet_variables += tmp.trainable_weights + tmp = Dense(1, name="value") + value = tmp(l_tau) + self._qnet_variables += tmp.trainable_weights + + meaner = Lambda(lambda x: K.mean(x, axis=1)) + mn_ = meaner(advantage) + tmp = subtract([advantage, mn_]) + policy = add([tmp, value], name="policy") + + model_all_outputs = [policy] + self._model = Model(inputs=models_all_inputs, outputs=model_all_outputs) + self._schedule_model, self._optimizer_model = self.make_optimiser() + self._model.compile(loss='mse', optimizer=self._optimizer_model) + + self._target_model = Model(inputs=models_all_inputs, outputs=model_all_outputs) + + def _make_x_tau(self, data): + # for the x's + data_x = [] + prev = 0 + for sz, add_, mul_ in zip(self._nn_archi.x_dims, + self._nn_archi.x_adds, + self._nn_archi.x_mults): + tmp = (data[:, prev:(prev+sz)] + add_) * mul_ + data_x.append(tmp) + prev += sz + + # for the input of the q network + data_q = [] + for sz, add_, mul_ in zip(self._nn_archi.input_q_dims, + self._nn_archi.input_q_adds, + self._nn_archi.input_q_mults): + data_q.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + # for the taus + data_tau = [] + for sz, add_, mul_ in zip(self._nn_archi.tau_dims, + self._nn_archi.tau_adds, + self._nn_archi.tau_mults): + data_tau.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + # TODO pre process that into different vector + data_topo = self._process_topo(data[:, prev:(prev+self._nn_archi.dim_topo)]) + + prev += self._nn_archi.dim_topo + # TODO predict also gen_q and load_v here, and p_or and q_or and p_ex and q_ex + data_flow = [] + for sz, add_, mul_ in zip(self._nn_archi.gm_out_dims, + self._nn_archi.gm_out_adds, + self._nn_archi.gm_out_mults): + data_flow.append((data[:, prev:(prev+sz)] + add_) * mul_) + prev += sz + + res = [*data_x, *data_q, *data_tau, data_topo], data_flow + return res + + def _process_topo(self, topo_vect): + """process the topology vector. 
+ + As input grid2op encode it: + - -1 disconnected + - 1 connected to bus 1 + - 2 connected to bus 2 + + I transform it in a vector having twice as many component with the encoding, if we move + "by pairs": + - [0,0] -> disconnected + - [1,0] -> connected to bus 1 + - [0,1] -> connected to bus 2 + """ + res = np.zeros((topo_vect.shape[0], 2*topo_vect.shape[1]), + dtype=np.float32) + tmp_ = np.where(topo_vect == 1.) + res[tmp_[0], 2*tmp_[1]] = 1. + tmp_ = np.where(topo_vect == 2.) + res[tmp_[0], 2*tmp_[1]+1] = 1. + return res + + def predict_movement(self, data, epsilon, batch_size=None, training=False): + """Predict movement of game controller where is epsilon + probability randomly move.""" + if batch_size is None: + batch_size = data.shape[0] + data_nn, true_output_grid = self._make_x_tau(data) + res = super().predict_movement(data_nn, epsilon=epsilon, batch_size=batch_size, training=False) + return res + + def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): + if batch_size is None: + batch_size = s_batch.shape[0] + data_nn, true_output_grid = self._make_x_tau(s_batch) + data_nn2, true_output_grid2 = self._make_x_tau(s2_batch) + + # train the grid model to accurately predict the state of the grid + # TODO predict also gen_q and load_v here, and p_or and q_or and p_ex and q_ex + loss1 = self.grid_model.train_on_batch(data_nn, true_output_grid) + loss2 = self.grid_model.train_on_batch(data_nn2, true_output_grid2) + + # and now train the q network + res = super().train(data_nn, + a_batch, + r_batch, + d_batch, + data_nn2, + tf_writer=tf_writer, + batch_size=batch_size) + + self.grid_model_losses_npy = 0.5*(np.array(loss1) + np.array(loss2)) + return res + + def train_on_batch(self, model, optimizer_model, x, y_true): + """ + clip the loss + """ + with tf.GradientTape() as tape: + # Get y_pred for batch + y_pred = model(x) + # Compute loss for each sample in the batch + # and then clip it + batch_loss = self._clipped_batch_loss(y_true, y_pred) + # Compute mean scalar loss + loss = tf.math.reduce_mean(batch_loss) + loss_npy = loss.numpy() + + # Compute gradients + grads = tape.gradient(loss, self._qnet_variables) + + # clip gradients + if self._max_global_norm_grad is not None: + grads, _ = tf.clip_by_global_norm(grads, self._max_global_norm_grad) + if self._max_value_grad is not None: + grads = [tf.clip_by_value(grad, -self._max_value_grad, self._max_value_grad) + for grad in grads] + + # Apply gradients + optimizer_model.apply_gradients(zip(grads, self._qnet_variables)) + # Store LR + self.train_lr = optimizer_model._decayed_lr('float32').numpy() + + # Return loss scalar + return loss_npy + + def _clipped_batch_loss(self, y_true, y_pred): + sq_error = tf.math.square(y_true - y_pred, name="sq_error") + batch_sq_error = tf.math.reduce_sum(sq_error, axis=1, name="batch_sq_error") + if self._max_loss is not None: + res = tf.clip_by_value(batch_sq_error, 0.0, self._max_loss, name="batch_sq_error_clip") + else: + res = batch_sq_error + return res + + def save_tensorboard(self, current_step): + if self.grid_model_losses_npy is not None: + for i, el in enumerate(self._nn_archi.list_attr_obs_gm_out): + tf.summary.scalar("loss_gridmodel_{}".format(el), + self.grid_model_losses_npy[i], + current_step, + description="Loss of the neural network representing the powergrid " + "for predicting {}" + "".format(el)) + + @staticmethod + def _get_path_model(path, name=None): + if name is None: + path_model = path + else: + path_model = os.path.join(path, name) + 
path_target_model = "{}_target".format(path_model) + path_grid_model = "{}_grid_model".format(path_model) + return path_model, path_target_model, path_grid_model + + def save_network(self, path, name=None, ext="h5"): + """ + Saves all the models with unique names + """ + path_model, path_target_model, path_grid_model = self._get_path_model(path, name) + self._model.save('{}.{}'.format(path_model, ext)) + self._target_model.save('{}.{}'.format(path_target_model, ext)) + self.grid_model.save('{}.{}'.format(path_grid_model, ext)) + + def load_network(self, path, name=None, ext="h5"): + """ + We load all the models using the keras "load_model" function. + """ + path_model, path_target_model, path_grid_model = self._get_path_model(path, name) + self.construct_q_network() + self._model.load_weights('{}.{}'.format(path_model, ext)) + self._target_model.load_weights('{}.{}'.format(path_target_model, ext)) + self.grid_model.load_weights('{}.{}'.format(path_grid_model, ext)) + if self.verbose: + print("Succesfully loaded network.") \ No newline at end of file diff --git a/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py new file mode 100644 index 0000000..f669770 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/LeapNetEncoded_NNParam.py @@ -0,0 +1,177 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. +import os +import numpy as np +import copy + +from l2rpn_baselines.utils import NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + + +class LeapNetEncoded_NNParam(NNParam): + """ + This class implements the type of parameters used by the DuelQLeapNet model. + + More information on the leap net can be found at `Leap Net on Github `_ + + Attributes + ----------- + list_attr_obs: + currently ot used + sizes: + currently not used + activs: + currently not used + x_dim: + currently not used + + list_attr_obs_x: + list of the attribute of the observation that serve as input of the grid model + (we recommend ["prod_p", "prod_v", "load_p", "load_q"]) + list_attr_obs_gm_out: + list of the attribute of the observation that serve as output for the grid model + (we recommend ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X) + though "rho" can be equally good an improve computation time + list_attr_obs_input_q: + list of the attribute of the observation that serve as input (other that the embedding of the + grid state) for the Q network (we recommend to have here anything "time related" for example + ["time_before_cooldown_line", "time_before_cooldown_sub", "actual_dispatch", + "target_dispatch", "day_of_week", "hour_of_day", "minute_of_hour"] etc. 
+ list_attr_obs_tau: + If you chose to encode your q network as a leap net it self, then you can put here the attribute + you would like the leap net to act on ( ["line_status", "timestep_overflow"] for example) + dim_topo: ``int`` + Dimension of the topology vector (init it with `env.dim_topo`) + + Examples + -------- + All other attributes need to be created once by a call to :func:`LeapNetEncoded_NNParam.compute_dims`: + + ..code-block:: python + + nn_archi.compute_dims(env) + nn_archi.center_reduce(env) + + These calls will set up all the attribute that are not set, and register this model to use + input data approximately in [-1,1] interval. + + + """ + _int_attr = copy.deepcopy(NNParam._int_attr) + _float_attr = copy.deepcopy(NNParam._float_attr) + _str_attr = copy.deepcopy(NNParam._str_attr) + _list_float = copy.deepcopy(NNParam._list_float) + _list_str = copy.deepcopy(NNParam._list_str) + _list_int = copy.deepcopy(NNParam._list_int) + + _int_attr += ["x_dim", "dim_topo"] + _list_str += ["list_attr_obs_tau", "list_attr_obs_x", "list_attr_obs_input_q", + "list_attr_obs_gm_out"] + _list_float += ["tau_adds", "tau_mults", "x_adds", "x_mults", + "input_q_adds", "input_q_mults", + "gm_out_adds", "gm_out_mults"] + _list_int += ["tau_dims", "x_dims", "gm_out_dims", "input_q_dims", + "sizes_enc", "sizes_main", "sizes_out_gm", "sizes_Qnet"] + nn_class = LeapNetEncoded_NN + + def __init__(self, + action_size, + observation_size, # not used here for retro compatibility with NNParam.from_dict + sizes, + activs, + x_dim, + + list_attr_obs, + list_attr_obs_tau, + list_attr_obs_x, + list_attr_obs_input_q, + list_attr_obs_gm_out, + + dim_topo, + + sizes_enc=(20, 20, 20), + sizes_main=(150, 150, 150), + sizes_out_gm=(100, 40), + sizes_Qnet=(100, 100, 100), + + input_q_adds=None, + input_q_mults=None, + gm_out_adds=None, + gm_out_mults=None, + tau_adds=None, + tau_mults=None, + x_adds=None, + x_mults=None, + + tau_dims=None, + x_dims=None, + gm_out_dims=None, + input_q_dims=None, + ): + NNParam.__init__(self, + action_size, + observation_size=0, # not used + sizes=sizes, + activs=activs, + list_attr_obs=list_attr_obs + ) + + self.x_dim = x_dim + + self.list_attr_obs_tau = [str(el) for el in list_attr_obs_tau] + self._define_adds_mults(tau_adds, "tau_adds", list_attr_obs_tau, 0.) + self._define_adds_mults(tau_mults, "tau_mults", list_attr_obs_tau, 1.) + + self.list_attr_obs_x = [str(el) for el in list_attr_obs_x] + self._define_adds_mults(x_adds, "x_adds", list_attr_obs_x, 0.) + self._define_adds_mults(x_mults, "x_mults", list_attr_obs_x, 1.) + + self.list_attr_obs_input_q = [str(el) for el in list_attr_obs_input_q] + self._define_adds_mults(input_q_adds, "input_q_adds", list_attr_obs_input_q, 0.) + self._define_adds_mults(input_q_mults, "input_q_mults", list_attr_obs_input_q, 1.) + + self.list_attr_obs_gm_out = [str(el) for el in list_attr_obs_gm_out] + self._define_adds_mults(gm_out_adds, "gm_out_adds", list_attr_obs_gm_out, 0.) + self._define_adds_mults(gm_out_mults, "gm_out_mults", list_attr_obs_gm_out, 1.) 
+ + # sizes of the neural network "blccks" + self.sizes_enc = sizes_enc + self.sizes_main = sizes_main + self.sizes_out_gm = sizes_out_gm + self.sizes_Qnet = sizes_Qnet + + # dimension of the topogly and number of powerline + self.dim_topo = dim_topo + + # dimension of the space (can be computed in the self.compute_dims) + self.input_q_dims = input_q_dims + self.gm_out_dims = gm_out_dims + self.x_dims = x_dims + self.tau_dims = tau_dims + + def get_obs_attr(self): + res = self.list_attr_obs_x + self.list_attr_obs_input_q + res += self.list_attr_obs_tau + ["topo_vect"] + self.list_attr_obs_gm_out + return res + + def compute_dims(self, env): + self.tau_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_tau] + self.x_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_x] + self.gm_out_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_gm_out] + self.input_q_dims = [int(LeapNetEncoded_NNParam.get_obs_size(env, [el])) for el in self.list_attr_obs_input_q] + + def _define_adds_mults(self, vector, varname, attr_composed, default_val): + if vector is None: + vector = [float(default_val) for _ in attr_composed] + setattr(self, varname, vector) + + def center_reduce(self, env): + self._center_reduce_vect(env.get_obs(), "x") + self._center_reduce_vect(env.get_obs(), "tau") + self._center_reduce_vect(env.get_obs(), "gm_out") + self._center_reduce_vect(env.get_obs(), "input_q") diff --git a/l2rpn_baselines/LeapNetEncoded/__init__.py b/l2rpn_baselines/LeapNetEncoded/__init__.py new file mode 100644 index 0000000..c801db8 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/__init__.py @@ -0,0 +1,11 @@ +__all__ = [ + "LeapNetEncoded", + "evaluate", + "train", + "LeapNetEncoded_NN" +] + +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded +from l2rpn_baselines.LeapNetEncoded.evaluate import evaluate +from l2rpn_baselines.LeapNetEncoded.train import train +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN diff --git a/l2rpn_baselines/LeapNetEncoded/evaluate.py b/l2rpn_baselines/LeapNetEncoded/evaluate.py new file mode 100644 index 0000000..0d95e8f --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/evaluate.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+ +import os +import tensorflow as tf + +from grid2op.MakeEnv import make +from grid2op.Runner import Runner +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.utils.save_log_gif import save_log_gif +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + +import pdb + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def evaluate(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """ + How to evaluate the performances of the trained DeepQSimple agent. + + Parameters + ---------- + env: :class:`grid2op.Environment` + The environment on which you evaluate your agent. + + name: ``str`` + The name of the trained baseline + + load_path: ``str`` + Path where the agent has been stored + + logs_path: ``str`` + Where to write the results of the assessment + + nb_episode: ``str`` + How many episodes to run during the assessment of the performances + + nb_process: ``int`` + On how many process the assessment will be made. (setting this > 1 can lead to some speed ups but can be + unstable on some plaform) + + max_steps: ``int`` + How many steps at maximum your agent will be assessed + + verbose: ``bool`` + Currently un used + + save_gif: ``bool`` + Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might + take a lot of ram) and drastically increase computation time. + + Returns + ------- + agent: :class:`l2rpn_baselines.utils.DeepQAgent` + The loaded agent that has been evaluated thanks to the runner. + + res: ``list`` + The results of the Runner on which the agent was tested. + + + Examples + ------- + You can evaluate a DeepQSimple this way: + + .. 
code-block:: python + + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.LeapNetEncoded import eval + + # Create dataset env + env = make("l2rpn_case14_sandbox", + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }) + + # Call evaluation interface + evaluate(env, + name="MyAwesomeAgent", + load_path="/WHERE/I/SAVED/THE/MODEL", + logs_path=None, + nb_episode=10, + nb_process=1, + max_steps=-1, + verbose=False, + save_gif=False) + + + """ + + # Limit gpu usage + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices): + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + runner_params = env.get_params_for_runner() + runner_params["verbose"] = verbose + + if load_path is None: + raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.") + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + nn_archi = LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + + # Run + # Create agent + agent = LeapNetEncoded(action_space=env.action_space, + name=name, + store_action=nb_process == 1, + nn_archi=nn_archi, + observation_space=env.observation_space) + + # Load weights from file + agent.load(load_path) + + # Build runner + runner = Runner(**runner_params, + agentClass=None, + agentInstance=agent) + + # Print model summary + stringlist = [] + agent.deep_q._model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + if verbose: + print(short_model_summary) + + # Run + os.makedirs(logs_path, exist_ok=True) + res = runner.run(path_save=logs_path, + nb_episode=nb_episode, + nb_process=nb_process, + max_iter=max_steps, + pbar=verbose) + + # Print summary + if verbose: + print("Evaluation summary:") + for _, chron_name, cum_reward, nb_time_step, max_ts in res: + msg_tmp = "chronics at: {}".format(chron_name) + msg_tmp += "\ttotal score: {:.6f}".format(cum_reward) + msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts) + print(msg_tmp) + + if len(agent.dict_action): + # I output some of the actions played + print("The agent played {} different action".format(len(agent.dict_action))) + for id_, (nb, act, types) in agent.dict_action.items(): + print("Action with ID {} was played {} times".format(id_, nb)) + print("{}".format(act)) + print("-----------") + + # if logs_path is not None: + # for path_dhron, chron_name, cum_reward, nb_time_step, max_ts in res: + # ep_data = EpisodeData.from_disk(logs_path, chron_name) + + if save_gif: + if verbose: + print("Saving the gif of the episodes") + save_log_gif(logs_path, res) + + return agent, res + + +if __name__ == "__main__": + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.utils import cli_eval + + # Parse command line + args = cli_eval().parse_args() + + # Create dataset env + env = make(args.env_name, + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }) + + # Call evaluation interface + evaluate(env, + name=args.name, + load_path=os.path.abspath(args.load_path), + logs_path=args.logs_dir, + nb_episode=args.nb_episode, + nb_process=args.nb_process, + max_steps=args.max_steps, + verbose=args.verbose, + save_gif=args.save_gif) diff --git a/l2rpn_baselines/LeapNetEncoded/study.py b/l2rpn_baselines/LeapNetEncoded/study.py new file mode 100644 index 0000000..fe7a958 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/study.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 
+ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import tensorflow as tf +import numpy as np +from tqdm import tqdm + +from grid2op.MakeEnv import make +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN + +import pdb + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def study(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """study the prediction of the grid_model""" + + # Limit gpu usage + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices): + tf.config.experimental.set_memory_growth(physical_devices[0], True) + + runner_params = env.get_params_for_runner() + runner_params["verbose"] = verbose + + if load_path is None: + raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.") + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + nn_archi = LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + + # Run + # Create agent + agent = LeapNetEncoded(action_space=env.action_space, + name=name, + store_action=nb_process == 1, + nn_archi=nn_archi, + observation_space=env.observation_space) + + # Load weights from file + agent.load(load_path) + + # Print model summary + stringlist = [] + agent.deep_q._model.summary(print_fn=lambda x: stringlist.append(x)) + short_model_summary = "\n".join(stringlist) + if verbose: + print(short_model_summary) + + from grid2op.Agent import RandomAgent + from grid2op.Agent import DoNothingAgent + policy_agent = DoNothingAgent(env.action_space) + policy_agent.seed(0) + + env.set_id(0) + res = {k: ([], []) for k in nn_archi.list_attr_obs_gm_out} + with tqdm(desc="step") as pbar: + for i in range(nb_episode): + obs = env.reset() + reward = env.reward_range[0] + done = False + while not done: + obs_converted = agent.convert_obs(obs) + data_nn, true_output_grid = agent.deep_q._make_x_tau(obs_converted) + + for i, (var_n, add, mult) in enumerate(zip(nn_archi.list_attr_obs_gm_out, + nn_archi.gm_out_adds, + nn_archi.gm_out_mults)): + tmp = true_output_grid[i] + tmp = tmp / mult - add + true_output_grid[i] = tmp + + pred = agent.deep_q.grid_model.predict(data_nn, batch_size=1) + real_pred = [] + for i, (var_n, add, mult) in enumerate(zip(nn_archi.list_attr_obs_gm_out, + nn_archi.gm_out_adds, + nn_archi.gm_out_mults)): + tmp = pred[i] + tmp = tmp / mult - add + real_pred.append(tmp) + + for i, var_n in enumerate(nn_archi.list_attr_obs_gm_out): + res[var_n][0].append(real_pred[i].reshape(-1)) + res[var_n][1].append(true_output_grid[i].reshape(-1)) + + obs, reward, done, info = env.step(policy_agent.act(obs, reward, done)) + pbar.update(1) 
+ + print("Results") + from sklearn.metrics import mean_squared_error + for var_n, (pred, true) in res.items(): + true = np.array(true) + pred = np.array(pred) + RMSE = mean_squared_error(y_true=true, y_pred=pred, multioutput="raw_values", squared=False) + print("RMSE for {}: {:.2f} % variance".format(var_n, 100. * np.mean(RMSE / np.std(true)))) + return agent + + +if __name__ == "__main__": + from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward + from l2rpn_baselines.utils import cli_eval + from grid2op.Parameters import Parameters + + # Parse command line + args = cli_eval().parse_args() + + # Create dataset env + param = Parameters() + param.NO_OVERFLOW_DISCONNECTION = True + env = make(args.env_name, + reward_class=L2RPNSandBoxScore, + other_rewards={ + "reward": L2RPNReward + }, + param=param) + + # Call evaluation interface + study(env, + name=args.name, + load_path=os.path.abspath(args.load_path), + logs_path=args.logs_dir, + nb_episode=args.nb_episode, + nb_process=args.nb_process, + max_steps=args.max_steps, + verbose=args.verbose, + save_gif=args.save_gif) diff --git a/l2rpn_baselines/LeapNetEncoded/train.py b/l2rpn_baselines/LeapNetEncoded/train.py new file mode 100755 index 0000000..5d74e13 --- /dev/null +++ b/l2rpn_baselines/LeapNetEncoded/train.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import warnings +import tensorflow as tf + +from l2rpn_baselines.utils import cli_train +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded import LeapNetEncoded, DEFAULT_NAME +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NN import LeapNetEncoded_NN +from l2rpn_baselines.utils import TrainingParam +from l2rpn_baselines.LeapNetEncoded.LeapNetEncoded_NNParam import LeapNetEncoded_NNParam +from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY + + +def train(env, + name=DEFAULT_NAME, + iterations=1, + save_path=None, + load_path=None, + logs_dir=None, + training_param=None, + filter_action_fun=None, + verbose=True, + kwargs_converters={}, + kwargs_archi={}): + """ + This function implements the "training" part of the baselines "SAC". This is the "old" implementation + that most likely had bugs. We keep it here for backward compatibility, but it is not recommended to + use it on new projects. + + Parameters + ---------- + env: :class:`grid2op.Environment` + Then environment on which you need to train your agent. + + name: ``str``` + The name of your agent. + + iterations: ``int`` + For how many iterations (steps) do you want to train your agent. NB these are not episode, these are steps. + + save_path: ``str`` + Where do you want to save your baseline. + + load_path: ``str`` + If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded + some of the argument provided to this function will not be used. + + logs_dir: ``str`` + Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them. + + training_param: :class:`l2rpn_baselines.utils.TrainingParam` + The parameters describing the way you will train your model. 
+ + filter_action_fun: ``function`` + A function to filter the action space. See + `IdToAct.filter_action `_ + documentation. + + verbose: ``bool`` + If you want something to be printed on the terminal (a better logging strategy will be put at some point) + + kwargs_converters: ``dict`` + A dictionary containing the key-word arguments pass at this initialization of the + :class:`grid2op.Converter.IdToAct` that serves as "Base" for the Agent. + + kwargs_archi: ``dict`` + Key word arguments used for making the :class:`DeepQ_NNParam` object that will be used to build the baseline. + + Returns + ------- + + baseline: :class:`DuelQLeapNet` + The trained baseline. + + + .. _Example-leapnetenc: + + Examples + --------- + Here is an example on how to train a DuelQLeapNet baseline. + + First define a python script, for example + + .. code-block:: python + + import grid2op + from grid2op.Reward import L2RPNReward + from l2rpn_baselines.utils import TrainingParam + from l2rpn_baselines.LeapNetEncoded import train + + # define the environment + env = grid2op.make("l2rpn_case14_sandbox", + reward_class=L2RPNReward) + + # use the default training parameters + tp = TrainingParam() + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env.dim_topo, + + "sizes_enc": (50, 50, 50, 50), + "sizes_main": (300, 300, 300), + "sizes_out_gm": (100, ), + "sizes_Qnet": (200, 200, 200) + } + + nm_ = args.name if args.name is not None else DEFAULT_NAME + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi, + verbose=True) + finally: + env.close() + + """ + + # Limit gpu usage + try: + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except AttributeError: + # issue of https://stackoverflow.com/questions/59266150/attributeerror-module-tensorflow-core-api-v2-config-has-no-attribute-list-p + try: + physical_devices = tf.config.experimental.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + + if training_param is None: + training_param = TrainingParam() + + # get the size of the action space + kwargs_archi["action_size"] = LeapNetEncoded.get_action_size(env.action_space, filter_action_fun, kwargs_converters) + kwargs_archi["observation_size"] = 0 # this is not used anyway + if load_path is not None: + # TODO test that + path_model, path_target_model = LeapNetEncoded_NN.get_path_model(load_path, name) + print("INFO: Reloading a model, the architecture parameters will be ignored") + nn_archi = 
LeapNetEncoded_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + else: + nn_archi = LeapNetEncoded_NNParam(**kwargs_archi) + # because i was lazy enough not to copy paste all the dimensions there + nn_archi.compute_dims(env) + # because i want data approximately reduced (for the learning process to be smoother) + nn_archi.center_reduce(env) + + baseline = LeapNetEncoded(action_space=env.action_space, + nn_archi=nn_archi, + name=name, + istraining=True, + filter_action_fun=filter_action_fun, + verbose=verbose, + **kwargs_converters + ) + + if load_path is not None: + print("INFO: Reloading a model, training parameters will be ignored") + baseline.load(load_path) + training_param = baseline._training_param + + baseline.train(env, + iterations, + save_path=save_path, + logdir=logs_dir, + training_param=training_param) + # as in our example (and in our explanation) we recommend to save the mode regurlarly in the "train" function + # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than + # recommended to save the "baseline" at the end of this function with: + # baseline.save(path_save) + + +if __name__ == "__main__": + # import grid2op + import numpy as np + from grid2op.Parameters import Parameters + from grid2op import make + from grid2op.Reward import BaseReward + from grid2op.dtypes import dt_float + import re + try: + from lightsim2grid.LightSimBackend import LightSimBackend + backend = LightSimBackend() + except: + from grid2op.Backend import PandaPowerBackend + backend = PandaPowerBackend() + + args = cli_train().parse_args() + + # is it highly recommended to modify the reward depening on the algorithm. + # for example here i will push my algorithm to learn that plyaing illegal or ambiguous action is bad + class MyReward(BaseReward): + power_rho = int(4) # to which "power" is put the rho values + + penalty_powerline_disco = 1.0 # how to penalize the powerline disconnected that can be reconnected + + # how to penalize the fact that a powerline will be disconnected next time steps, because it's close to + # an overflow + penalty_powerline_close_disco = 1.0 + + # cap the minimum reward (put None to ignore) + cap_min = -0.5 # if the minimum reward is too low, model will not learn easily. It will be "scared" to take + # actions. Because you win more or less points 1 by 1, but you can lose them + # way way faster. 
+ + def __init__(self): + self.reward_min = 0 + self.reward_max = 0 + self.ts_overflow = None + + def initialize(self, env): + self.ts_overflow = env.parameters.NB_TIMESTEP_OVERFLOW_ALLOWED-1 + # now calibrate min and max reward + hard_overflow = env.parameters.HARD_OVERFLOW_THRESHOLD + max_flow_penalty = self.flow_penalty(rho=np.ones(env.n_line) * hard_overflow) / env.n_line + disconnected_powerline_that_can_be_reconnected = self.penalty_powerline_disco + disconnected_still_connected_powerline_on_overflow = self.penalty_powerline_close_disco + self.reward_min = max_flow_penalty - disconnected_powerline_that_can_be_reconnected + self.reward_min -= disconnected_still_connected_powerline_on_overflow + if self.cap_min is not None: + self.reward_min = max(self.reward_min, self.cap_min) + self.reward_max = 1.0 + + def flow_penalty(self, rho): + tmp = 1 - rho**self.power_rho + return tmp.sum() + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + if has_error or is_ambiguous: + # previous action was bad + res = self.reward_min #self.reward_min + elif is_done: + # really strong reward if an episode is over without game over + res = self.reward_max + else: + if env.get_obs() is not None: + obs = env.get_obs() + res = self.flow_penalty(rho=obs.rho) + disconnected_powerline_that_can_be_reconnected = np.sum((obs.time_before_cooldown_line == 0) & + (~obs.line_status)) + disconnected_still_connected_powerline_on_overflow = np.sum((obs.timestep_overflow == self.ts_overflow) & + (obs.rho >= 1.)) + res -= disconnected_powerline_that_can_be_reconnected * self.penalty_powerline_disco + res -= disconnected_still_connected_powerline_on_overflow * self.penalty_powerline_close_disco + else: + res = env.n_line + res /= env.n_line + if is_illegal: + if res > 0.: + res *= 0.1 # divide by 10 reward for illegal actions + else: + res *= 10. + if not np.isfinite(res): + res = self.reward_min + + if self.cap_min is not None: + res = max(res, self.cap_min) + return dt_float(res) + + # Use custom params + + # Create grid2op game environement + env_init = None + from grid2op.Chronics import MultifolderWithCache + game_param = Parameters() + game_param.NB_TIMESTEP_COOLDOWN_SUB = 2 + game_param.NB_TIMESTEP_COOLDOWN_LINE = 2 + env = make(args.env_name, + param=game_param, + reward_class=MyReward, + backend=backend, + chronics_class=MultifolderWithCache + ) + + if env.name == "l2rpn_wcci_2020": + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*Scenario_february_.*$", x) is not None) + env.chronics_handler.real_data.reset() + elif env.name == "l2rpn_case14_sandbox": + # all data can be loaded into memory + # env.chronics_handler.real_data.set_filter(lambda x: True) + env.chronics_handler.real_data.reset() + + # env.chronics_handler.real_data. 
+ env_init = env + if args.nb_env > 1: + from l2rpn_baselines.utils import make_multi_env + env = make_multi_env(env_init=env_init, nb_env=int(args.nb_env)) + + tp = TrainingParam() + # NN training + tp.lr = 1e-5 + tp.lr_decay_steps = 300000 + tp.minibatch_size = 32 * int(args.nb_env) + tp.update_freq = tp.minibatch_size / 2 + + # limit the number of time steps played per scenarios + tp.step_increase_nb_iter = None # None to deactivate it + tp.min_iter = None + tp.update_nb_iter = None # once 100 scenarios are solved, increase of "step_increase_nb_iter" + + # oversampling hard scenarios + tp.oversampling_rate = None # None to deactivate it + + # experience replay + tp.buffer_size = 1000000 + + # just observe the data for a while + tp.min_observe = None # int(10000) + + # e greedy + tp.min_observation = 128 + tp.initial_epsilon = 0.2 + tp.final_epsilon = 1./(288.) + tp.step_for_final_epsilon = int(1e5) + # TODO add the "i dont do anything for a few time steps at the beginning of the training" + + # don't start always at the same hour (if not None) otherwise random sampling, see docs + tp.random_sample_datetime_start = None + + # saving, logging etc. + tp.save_model_each = 10000 + tp.update_tensorboard_freq = 256 + + # which actions i keep + if env.name == "l2rpn_case14_sandbox": + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": True, + "set_topo_vect": False, + "redispacth": False + } + else: + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": False, + "set_topo_vect": False, + "redispacth": False + } + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env_init.dim_topo, + + "sizes_enc": (50, 50, 50, 50), + "sizes_main": (300, 300, 300), + "sizes_out_gm": (100, ), + "sizes_Qnet": (200, 200, 200) + } + + nm_ = args.name if args.name is not None else DEFAULT_NAME + # python3 train.py --env_name="l2rpn_wcci_2020" --save_path="model_saved" --logs_dir="tf_logs" --num_train_steps=10000 --name="InitialTest4 + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi, + verbose=True) + finally: + env.close() + if args.nb_env > 1: + env_init.close() diff --git a/l2rpn_baselines/SAC/SAC_NN.py b/l2rpn_baselines/SAC/SAC_NN.py index 0a40967..172fb16 100644 --- a/l2rpn_baselines/SAC/SAC_NN.py +++ b/l2rpn_baselines/SAC/SAC_NN.py @@ -35,6 +35,13 @@ class SAC_NN(BaseDeepQ): However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ` with custom parameters class (in this calse :class:`SAC_NNParam` is flexible enough to meet our needs. 
+ References + ----------- + Original paper: + https://arxiv.org/abs/1801.01290 + + modified for discrete action space: + https://arxiv.org/abs/1910.07207 """ def __init__(self, nn_params, @@ -60,7 +67,6 @@ def __init__(self, self.model_Q2 = None self.model_policy = None - self.construct_q_network() self.previous_size = 0 self.previous_eyes = None self.previous_arange = None @@ -77,6 +83,8 @@ def __init__(self, self.schedule_lr_value = None self.optimizer_value = None + self.construct_q_network() + def _build_q_NN(self): input_states = Input(shape=(self._observation_size,)) input_action = Input(shape=(self._action_size,)) @@ -147,19 +155,20 @@ def _get_eye_pm(self, batch_size): self.previous_size = batch_size return self.previous_eyes, self.previous_arange - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """ predict the next movements in a vectorized fashion """ if batch_size is None: batch_size = data.shape[0] rand_val = np.random.random(data.shape[0]) - p_actions = self.model_policy.predict(data, batch_size=batch_size) + p_actions = self.model_policy(data, training=training).numpy() opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1) opt_policy = 1.0 * opt_policy_orig opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) opt_policy = opt_policy.astype(np.int) - return opt_policy, p_actions[:, opt_policy] + idx = np.arange(batch_size) + return opt_policy, p_actions[idx, opt_policy], p_actions def _get_eye_train(self, batch_size): if batch_size != self.previous_size_train: @@ -175,18 +184,25 @@ def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, ba if batch_size is None: batch_size = s_batch.shape[0] target = np.zeros((batch_size, 1)) + # training of the action state value networks last_action = np.zeros((batch_size, self._action_size)) + # Save the graph just the first time if tf_writer is not None: tf.summary.trace_on() - fut_action = self.model_value_target.predict(s2_batch, batch_size=batch_size).reshape(-1) + # TODO is it s2 or s ? For me it should be s... + fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1) + # TODO ***_target should be for the Q function instead imho + if tf_writer is not None: with tf_writer.as_default(): tf.summary.trace_export("model_value_target-graph", 0) tf.summary.trace_off() + # TODO is it rather `targets[:, a_batch]` target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action loss = self.model_Q.train_on_batch([s_batch, last_action], target) loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target) diff --git a/l2rpn_baselines/SACOld/SACOld.py b/l2rpn_baselines/SACOld/SACOld.py new file mode 100644 index 0000000..ab5ad39 --- /dev/null +++ b/l2rpn_baselines/SACOld/SACOld.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. 
+
+from l2rpn_baselines.utils import DeepQAgent
+from l2rpn_baselines.SAC.SAC_NN import SAC_NN
+DEFAULT_NAME = "SACOld"
+
+
+class SACOld(DeepQAgent):
+    """
+    This is the :class:`l2rpn_baselines.utils.DeepQAgent` representing the SAC agent (old implementation).
+
+    Please don't use this baseline if you start a new project; prefer the new, double-checked
+    SAC implementation (:class:`l2rpn_baselines.SAC.SAC`) instead.
+    """
+    pass
diff --git a/l2rpn_baselines/SACOld/SACOld_NN.py b/l2rpn_baselines/SACOld/SACOld_NN.py
new file mode 100644
index 0000000..762d761
--- /dev/null
+++ b/l2rpn_baselines/SACOld/SACOld_NN.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import numpy as np
+import os
+import tensorflow as tf
+
+# tf2.0 friendly
+import warnings
+
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    from tensorflow.keras.models import load_model, Sequential, Model
+    from tensorflow.keras.layers import Activation, Dense
+    from tensorflow.keras.layers import Input, Concatenate
+
+from l2rpn_baselines.utils import BaseDeepQ, TrainingParam
+
+
+# This class implements the "Soft Actor Critic" model.
+# It is a custom implementation, courtesy of Clement Goubet
+# The original paper is: https://arxiv.org/abs/1801.01290
+class SACOld_NN(BaseDeepQ):
+    """
+    Constructs the desired soft actor critic network.
+
+    Compared to other baselines shown elsewhere (*eg* :class:`l2rpn_baselines.DeepQSimple`)
+    the implementation of the SAC is a bit more tricky
+    (and was most likely NOT done properly in this class). For a more correct implementation
+    of SAC please look at the :class:`l2rpn_baselines.SAC.SAC` instead. This class is only
+    present for backward compatibility.
+
+    However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ` with custom
+    parameters class (in this case :class:`SACOld_NNParam`) is flexible enough to meet our needs.
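+
+    A minimal sketch of how the architecture is described (all the sizes and observation attributes
+    below are purely illustrative; see the ``train`` function of this baseline for a complete example):
+
+    .. code-block:: python
+
+        from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+
+        # illustrative values only
+        nn_archi = SACOld_NNParam(action_size=10,
+                                  observation_size=100,
+                                  sizes=[100, 50],  # hidden layers of the two Q networks
+                                  activs=["relu", "relu"],
+                                  list_attr_obs=["prod_p", "load_p", "rho"],
+                                  sizes_value=[100],  # hidden layers of the "value" network
+                                  activs_value=["relu"],
+                                  sizes_policy=[100, 50],  # hidden layers of the "policy" network
+                                  activs_policy=["relu", "relu"])
+        # nn_archi.nn_class is SACOld_NN: it is the class used to build the actual keras models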
+ + References + ----------- + Original paper: + https://arxiv.org/abs/1801.01290 + + modified for discrete action space: + https://arxiv.org/abs/1910.07207 + """ + def __init__(self, + nn_params, + training_param=None, + verbose=False): + if training_param is None: + training_param = TrainingParam() + BaseDeepQ.__init__(self, + nn_params, + training_param, + verbose=verbose) + + # TODO add as meta param the number of "Q" you want to use (here 2) + # TODO add as meta param size and types of the networks + self.average_reward = 0 + self.life_spent = 1 + self.qvalue_evolution = np.zeros((0,)) + self.Is_nan = False + + self.model_value_target = None + self.model_value = None + self.model_Q = None + self.model_Q2 = None + self.model_policy = None + + self.previous_size = 0 + self.previous_eyes = None + self.previous_arange = None + self.previous_size_train = 0 + self.previous_eyes_train = None + + # optimizers and learning rate + self.schedule_lr_policy = None + self.optimizer_policy = None + self.schedule_lr_Q = None + self.optimizer_Q = None + self.schedule_lr_Q2 = None + self.optimizer_Q2 = None + self.schedule_lr_value = None + self.optimizer_value = None + + self.construct_q_network() + + def _build_q_NN(self): + input_states = Input(shape=(self._observation_size,)) + input_action = Input(shape=(self._action_size,)) + + input_layer = Concatenate()([input_states, input_action]) + lay = input_layer + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes, self._nn_archi.activs)): + lay = Dense(size, name="layer_{}".format(lay_num))(lay) # put at self.action_size + lay = Activation(act)(lay) + + advantage = Dense(1, activation='linear')(lay) + + model = Model(inputs=[input_states, input_action], outputs=[advantage]) + return model + + def _build_model_value(self): + input_states = Input(shape=(self._observation_size,)) + + lay = input_states + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_value, self._nn_archi.activs_value)): + lay = Dense(size)(lay) + lay = Activation(act)(lay) + + advantage = Dense(self._action_size, activation='relu')(lay) + state_value = Dense(1, activation='linear', name="state_value")(advantage) + model = Model(inputs=[input_states], outputs=[state_value]) + return model + + def construct_q_network(self): + """ + This constructs all the networks needed for the SAC agent. 
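+
+        More precisely it builds (see the body of this method):
+
+        - ``model_Q`` and ``model_Q2``: the two Q networks (trained with a ``mse`` loss);
+        - ``model_value`` and ``model_value_target``: the state value network and its target copy;
+        - ``model_policy``: the softmax policy network (trained with a ``categorical_crossentropy`` loss).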
+ """ + self.model_Q = self._build_q_NN() + self.schedule_lr_Q, self.optimizer_Q = self.make_optimiser() + self.model_Q.compile(loss='mse', optimizer=self.optimizer_Q) + + self.model_Q2 = self._build_q_NN() + self.schedule_lr_Q2, self.optimizer_Q2 = self.make_optimiser() + self.model_Q2.compile(loss='mse', optimizer=self.optimizer_Q2) + + # state value function approximation + self.model_value = self._build_model_value() + self.schedule_lr_value, self.optimizer_value = self.make_optimiser() + self._optimizer_model = self.optimizer_value + self.model_value.compile(loss='mse', optimizer=self.optimizer_value) + + self.model_value_target = self._build_model_value() + self.model_value_target.set_weights(self.model_value.get_weights()) + + # policy function approximation + self.model_policy = Sequential() + # proba of choosing action a depending on policy pi + input_states = Input(shape=(self._observation_size,)) + lay = input_states + for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_policy, self._nn_archi.activs_policy)): + lay = Dense(size)(lay) + lay = Activation(act)(lay) + soft_proba = Dense(self._action_size, activation="softmax", kernel_initializer='uniform', name="soft_proba")(lay) + self.model_policy = Model(inputs=[input_states], outputs=[soft_proba]) + self.schedule_lr_policy, self.optimizer_policy = self.make_optimiser() + self.model_policy.compile(loss='categorical_crossentropy', optimizer=self.optimizer_policy) + + def _get_eye_pm(self, batch_size): + if batch_size != self.previous_size: + tmp = np.zeros((batch_size, self._action_size), dtype=np.float32) + self.previous_eyes = tmp + self.previous_arange = np.arange(batch_size) + self.previous_size = batch_size + return self.previous_eyes, self.previous_arange + + def predict_movement(self, data, epsilon, batch_size=None, training=False): + """ + predict the next movements in a vectorized fashion + """ + if batch_size is None: + batch_size = data.shape[0] + rand_val = np.random.random(data.shape[0]) + p_actions = self.model_policy(data, training=training).numpy() + opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1) + opt_policy = 1.0 * opt_policy_orig + opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) + opt_policy = opt_policy.astype(np.int) + return opt_policy, p_actions[:, opt_policy], p_actions + + def _get_eye_train(self, batch_size): + if batch_size != self.previous_size_train: + self.previous_eyes_train = np.repeat(np.eye(self._action_size), + batch_size * np.ones(self._action_size, dtype=np.int), + axis=0) + self.previous_eyes_train = tf.convert_to_tensor(self.previous_eyes_train, dtype=tf.float32) + self.previous_size_train = batch_size + return self.previous_eyes_train + + def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): + """Trains networks to fit given parameters""" + if batch_size is None: + batch_size = s_batch.shape[0] + target = np.zeros((batch_size, 1)) + + # training of the action state value networks + last_action = np.zeros((batch_size, self._action_size)) + + # Save the graph just the first time + if tf_writer is not None: + tf.summary.trace_on() + # TODO is it s2 or s ? For me it should be s... 
+ fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1) + # TODO ***_target should be for the Q function instead imho + + if tf_writer is not None: + with tf_writer.as_default(): + tf.summary.trace_export("model_value_target-graph", 0) + tf.summary.trace_off() + + # TODO is it rather `targets[:, a_batch]` + target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action + loss = self.model_Q.train_on_batch([s_batch, last_action], target) + loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target) + + self.life_spent += 1 + temp = 1 / np.log(self.life_spent) / 2 + tiled_batch = np.tile(s_batch, (self._action_size, 1)) + tiled_batch_ts = tf.convert_to_tensor(tiled_batch) + # tiled_batch: output something like: batch, batch, batch + # TODO save that somewhere not to compute it each time, you can even save this in the + # TODO tensorflow graph! + tmp = self._get_eye_train(batch_size) + + action_v1_orig = self.model_Q.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1) + action_v2_orig = self.model_Q2.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1) + action_v1 = action_v1_orig - np.amax(action_v1_orig, axis=-1).reshape(batch_size, 1) + new_proba = np.exp(action_v1 / temp) / np.sum(np.exp(action_v1 / temp), axis=-1).reshape(batch_size, 1) + new_proba_ts = tf.convert_to_tensor(new_proba) + loss_policy = self.model_policy.train_on_batch(s_batch, new_proba_ts) + + target_pi = self.model_policy.predict(s_batch, batch_size=batch_size) + value_target = np.fmin(action_v1_orig[0, a_batch], action_v2_orig[0, a_batch]) - np.sum( + target_pi * np.log(target_pi + 1e-6)) + value_target_ts = tf.convert_to_tensor(value_target.reshape(-1, 1)) + loss_value = self.model_value.train_on_batch(s_batch, value_target_ts) + + self.Is_nan = np.isnan(loss) + np.isnan(loss_2) + np.isnan(loss_policy) + np.isnan(loss_value) + return np.all(np.isfinite(loss)) & np.all(np.isfinite(loss_2)) & np.all(np.isfinite(loss_policy)) & \ + np.all(np.isfinite(loss_value)) + + @staticmethod + def _get_path_model(path, name=None): + if name is None: + path_model = path + else: + path_model = os.path.join(path, name) + path_target_model = "{}_target".format(path_model) + path_modelQ = "{}_Q".format(path_model) + path_modelQ2 = "{}_Q2".format(path_model) + path_policy = "{}_policy".format(path_model) + return path_model, path_target_model, path_modelQ, path_modelQ2, path_policy + + def save_network(self, path, name=None, ext="h5"): + """ + Saves all the models with unique names + """ + path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name) + self.model_value.save('{}.{}'.format(path_model, ext)) + self.model_value_target.save('{}.{}'.format(path_target_model, ext)) + self.model_Q.save('{}.{}'.format(path_modelQ, ext)) + self.model_Q2.save('{}.{}'.format(path_modelQ2, ext)) + self.model_policy.save('{}.{}'.format(path_policy, ext)) + + def load_network(self, path, name=None, ext="h5"): + """ + We load all the models using the keras "load_model" function. 
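+
+        More precisely, the networks are first re-built (``construct_q_network``) and the weights of each of
+        the five models (value, value target, Q, Q2 and policy) are then loaded (``load_weights``) from the
+        files written by ``save_network``.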
+        """
+        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
+        self.construct_q_network()
+        self.model_value.load_weights('{}.{}'.format(path_model, ext))
+        self.model_value_target.load_weights('{}.{}'.format(path_target_model, ext))
+        self.model_Q.load_weights('{}.{}'.format(path_modelQ, ext))
+        self.model_Q2.load_weights('{}.{}'.format(path_modelQ2, ext))
+        self.model_policy.load_weights('{}.{}'.format(path_policy, ext))
+        if self.verbose:
+            print("Successfully loaded network.")
+
+    def target_train(self):
+        """
+        This updates the target model.
+        """
+        model_weights = self.model_value.get_weights()
+        target_model_weights = self.model_value_target.get_weights()
+        for i in range(len(model_weights)):
+            target_model_weights[i] = self._training_param.tau * model_weights[i] + (1 - self._training_param.tau) * \
+                                      target_model_weights[i]
+        self.model_value_target.set_weights(model_weights)
diff --git a/l2rpn_baselines/SACOld/SACOld_NNParam.py b/l2rpn_baselines/SACOld/SACOld_NNParam.py
new file mode 100644
index 0000000..2521842
--- /dev/null
+++ b/l2rpn_baselines/SACOld/SACOld_NNParam.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+import copy
+
+from l2rpn_baselines.utils import NNParam
+from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+
+
+class SACOld_NNParam(NNParam):
+    """
+
+    Do not use this SACOld class; prefer the "more correct"
+    :class:`l2rpn_baselines.SAC.SAC` class for new projects instead. This module is only here
+    for backward compatibility.
+
+    Attributes
+    ----------
+    sizes_value: ``list``
+        List of integers, each one representing the size of a hidden layer of the "value" neural network.
+
+    activs_value: ``list``
+        List of ``str``, one per hidden layer of the "value" neural network, indicating which activation
+        function to use.
+
+    sizes_policy: ``list``
+        List of integers, each representing the size of a hidden layer of the "policy" network.
+ + activs_policy: ``list`` + List of ``str``: The activation functions (for each layer) of the policy network + + """ + _int_attr = copy.deepcopy(NNParam._int_attr) + _float_attr = copy.deepcopy(NNParam._float_attr) + _str_attr = copy.deepcopy(NNParam._str_attr) + _list_float = copy.deepcopy(NNParam._list_float) + _list_str = copy.deepcopy(NNParam._list_str) + _list_int = copy.deepcopy(NNParam._list_int) + + _list_str += ["activs_value", "activs_policy"] + _list_int += ["sizes_value", "sizes_policy"] + + nn_class = SACOld_NN + + def __init__(self, + action_size, + observation_size, # TODO this might not be usefull + sizes, + activs, + list_attr_obs, + sizes_value, + activs_value, + sizes_policy, + activs_policy + ): + NNParam.__init__(self, + action_size, + observation_size, # TODO this might not be usefull + sizes, + activs, + list_attr_obs + ) + self.sizes_value = sizes_value + self.activs_value = activs_value + self.sizes_policy = sizes_policy + self.activs_policy = activs_policy diff --git a/l2rpn_baselines/SACOld/__init__.py b/l2rpn_baselines/SACOld/__init__.py new file mode 100644 index 0000000..a2ccffb --- /dev/null +++ b/l2rpn_baselines/SACOld/__init__.py @@ -0,0 +1,11 @@ +__all__ = [ + "SACOld", + "evaluate", + "train", + "SACOld_NNParam" +] + +from l2rpn_baselines.SACOld.SACOld import SACOld +from l2rpn_baselines.SACOld.evaluate import evaluate +from l2rpn_baselines.SACOld.train import train +from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam diff --git a/l2rpn_baselines/SACOld/evaluate.py b/l2rpn_baselines/SACOld/evaluate.py new file mode 100644 index 0000000..c4a710d --- /dev/null +++ b/l2rpn_baselines/SACOld/evaluate.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020, RTE (https://www.rte-france.com) +# See AUTHORS.txt +# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0. +# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file, +# you can obtain one at http://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions. + +import os +import tensorflow as tf + +from grid2op.MakeEnv import make +from grid2op.Runner import Runner +from grid2op.Reward import * +from grid2op.Action import * + +from l2rpn_baselines.utils.save_log_gif import save_log_gif +from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME +from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam +from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN + +DEFAULT_LOGS_DIR = "./logs-eval/do-nothing-baseline" +DEFAULT_NB_EPISODE = 1 +DEFAULT_NB_PROCESS = 1 +DEFAULT_MAX_STEPS = -1 + + +def evaluate(env, + name=DEFAULT_NAME, + load_path=None, + logs_path=DEFAULT_LOGS_DIR, + nb_episode=DEFAULT_NB_EPISODE, + nb_process=DEFAULT_NB_PROCESS, + max_steps=DEFAULT_MAX_STEPS, + verbose=False, + save_gif=False): + """ + How to evaluate the performances of the trained SAC agent (old implementation). + + Please use the new implementation instead. + + Parameters + ---------- + env: :class:`grid2op.Environment` + The environment on which you evaluate your agent. 
+
+    name: ``str``
+        The name of the trained baseline
+
+    load_path: ``str``
+        Path where the agent has been stored
+
+    logs_path: ``str``
+        Where to write the results of the assessment
+
+    nb_episode: ``int``
+        How many episodes to run during the assessment of the performances
+
+    nb_process: ``int``
+        On how many processes the assessment will be made (setting this > 1 can lead to some speed up but can be
+        unstable on some platforms)
+
+    max_steps: ``int``
+        How many steps at maximum your agent will be assessed
+
+    verbose: ``bool``
+        Currently unused
+
+    save_gif: ``bool``
+        Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might
+        take a lot of ram) and drastically increase computation time.
+
+    Returns
+    -------
+    agent: :class:`l2rpn_baselines.utils.DeepQAgent`
+        The loaded agent that has been evaluated thanks to the runner.
+
+    res: ``list``
+        The results of the Runner on which the agent was tested.
+
+
+    Examples
+    --------
+    You can evaluate a SACOld agent this way:
+
+    .. code-block:: python
+
+        from grid2op import make
+        from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward
+        from l2rpn_baselines.SACOld import evaluate
+
+        # Create dataset env
+        env = make("l2rpn_case14_sandbox",
+                   reward_class=L2RPNSandBoxScore,
+                   other_rewards={
+                       "reward": L2RPNReward
+                   })
+
+        # Call evaluation interface
+        evaluate(env,
+                 name="MyAwesomeAgent",
+                 load_path="/WHERE/I/SAVED/THE/MODEL",
+                 logs_path=None,
+                 nb_episode=10,
+                 nb_process=1,
+                 max_steps=-1,
+                 verbose=False,
+                 save_gif=False)
+    """
+
+    # Limit gpu usage
+    physical_devices = tf.config.list_physical_devices('GPU')
+    if len(physical_devices):
+        tf.config.experimental.set_memory_growth(physical_devices[0], True)
+
+    runner_params = env.get_params_for_runner()
+    runner_params["verbose"] = verbose
+
+    if load_path is None:
+        raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.")
+    path_model, path_target_model = SACOld_NN.get_path_model(load_path, name)
+    nn_archi = SACOld_NNParam.from_json(os.path.join(path_model, "nn_architecture.json"))
+
+    # Run
+    # Create agent
+    agent = SACOld(action_space=env.action_space,
+                   name=name,
+                   store_action=nb_process == 1,
+                   nn_archi=nn_archi,
+                   observation_space=env.observation_space)
+
+    # Load weights from file
+    agent.load(load_path)
+
+    # Print model summary
+    stringlist = []
+    agent.deep_q.model_value.summary(print_fn=lambda x: stringlist.append(x))
+    short_model_summary = "\n".join(stringlist)
+
+    if verbose:
+        print("Value model: {}".format(short_model_summary))
+
+    # Build runner
+    runner = Runner(**runner_params,
+                    agentClass=None,
+                    agentInstance=agent)
+
+    # Run
+    os.makedirs(logs_path, exist_ok=True)
+    res = runner.run(path_save=logs_path,
+                     nb_episode=nb_episode,
+                     nb_process=nb_process,
+                     max_iter=max_steps,
+                     pbar=verbose)
+
+    # Print summary
+
+    if verbose:
+        print("Evaluation summary:")
+        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
+            msg_tmp = "chronics at: {}".format(chron_name)
+            msg_tmp += "\ttotal score: {:.6f}".format(cum_reward)
+            msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
+            print(msg_tmp)
+
+        if len(agent.dict_action):
+            # I output some of the actions played
+            print("The agent played {} different actions".format(len(agent.dict_action)))
+            for id_, (nb, act, types) in agent.dict_action.items():
+                print("Action with ID {} was played {} times".format(id_, nb))
+                print("{}".format(act))
+                print("-----------")
+
+    if save_gif:
+        if verbose:
+            print("Saving the gif of the episodes")
+
        save_log_gif(logs_path, res)
+
+    return agent, res
+
+
+if __name__ == "__main__":
+    from grid2op.Reward import L2RPNSandBoxScore, L2RPNReward
+    from l2rpn_baselines.utils import cli_eval
+
+    # Parse command line
+    args = cli_eval().parse_args()
+
+    # Create dataset env
+    env = make(args.env_name,
+               reward_class=L2RPNSandBoxScore,
+               other_rewards={
+                   "reward": L2RPNReward
+               })
+
+    # Call evaluation interface
+    evaluate(env,
+             name=args.name,
+             load_path=os.path.abspath(args.load_path),
+             logs_path=args.logs_dir,
+             nb_episode=args.nb_episode,
+             nb_process=args.nb_process,
+             max_steps=args.max_steps,
+             verbose=args.verbose,
+             save_gif=args.save_gif)
diff --git a/l2rpn_baselines/SACOld/train.py b/l2rpn_baselines/SACOld/train.py
new file mode 100755
index 0000000..48a003c
--- /dev/null
+++ b/l2rpn_baselines/SACOld/train.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2020, RTE (https://www.rte-france.com)
+# See AUTHORS.txt
+# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
+# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
+# you can obtain one at http://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
+
+import os
+import tensorflow as tf
+import warnings
+
+from l2rpn_baselines.utils import cli_train
+from l2rpn_baselines.SACOld.SACOld import SACOld, DEFAULT_NAME
+from l2rpn_baselines.SACOld.SACOld_NNParam import SACOld_NNParam
+from l2rpn_baselines.SACOld.SACOld_NN import SACOld_NN
+from l2rpn_baselines.utils import TrainingParam
+from l2rpn_baselines.utils.waring_msgs import _WARN_GPU_MEMORY
+
+
+def train(env,
+          name=DEFAULT_NAME,
+          iterations=1,
+          save_path=None,
+          load_path=None,
+          logs_dir=None,
+          training_param=None,
+          filter_action_fun=None,
+          verbose=True,
+          kwargs_converters={},
+          kwargs_archi={}):
+    """
+    This function implements the "training" part of the baseline "SAC" (old, buggy implementation).
+    Please use :class:`l2rpn_baselines.SAC.SAC` for new projects.
+
+    Parameters
+    ----------
+    env: :class:`grid2op.Environment`
+        The environment on which you need to train your agent.
+
+    name: ``str``
+        The name of your agent.
+
+    iterations: ``int``
+        For how many iterations (steps) do you want to train your agent. NB these are not episodes, these are steps.
+
+    save_path: ``str``
+        Where do you want to save your baseline.
+
+    load_path: ``str``
+        If you want to reload your baseline, specify the path where it is located. **NB** if a baseline is reloaded
+        some of the arguments provided to this function will not be used.
+
+    logs_dir: ``str``
+        Where to store the tensorboard generated logs during the training. ``None`` if you don't want to log them.
+
+    verbose: ``bool``
+        If you want something to be printed on the terminal (a better logging strategy will be put at some point)
+
+    training_param: :class:`l2rpn_baselines.utils.TrainingParam`
+        The parameters describing the way you will train your model.
+
+    filter_action_fun: ``function``
+        A function to filter the action space. See
+        `IdToAct.filter_action `_
+        documentation.
+
+    kwargs_converters: ``dict``
+        A dictionary containing the key-word arguments passed at the initialization of the
+        :class:`grid2op.Converter.IdToAct` that serves as "Base" for the Agent.
+
+    kwargs_archi: ``dict``
+        Key-word arguments used for making the :class:`SACOld_NNParam` object that will be used to build the baseline.
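+
+        **NB** the ``action_size`` key does not need to be provided: this function computes it from the
+        action space and ``kwargs_converters`` before building the :class:`SACOld_NNParam`, roughly as in
+        the (simplified, illustrative) sketch below:
+
+        .. code-block:: python
+
+            # what train() does internally, in substance (see the body of this function)
+            kwargs_archi["action_size"] = SACOld.get_action_size(env.action_space, filter_action_fun, kwargs_converters)
+            nn_archi = SACOld_NNParam(**kwargs_archi)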
+ + Returns + ------- + + baseline: :class:`SACOld` + The trained baseline. + + + .. _Example-sacold: + + Examples + --------- + Here is an example on how to train a SAC baseline. + + First define a python script, for example + + .. code-block:: python + + import grid2op + from grid2op.Reward import L2RPNReward + from l2rpn_baselines.utils import TrainingParam, NNParam + from l2rpn_baselines.SACOld import train + + # define the environment + env = grid2op.make("l2rpn_case14_sandbox", + reward_class=L2RPNReward) + + # use the default training parameters + tp = TrainingParam() + + # this will be the list of what part of the observation I want to keep + # more information on https://grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes + li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] + + # neural network architecture + observation_size = NNParam.get_obs_size(env, li_attr_obs_X) + sizes_q = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + sizes_v = [800, 800] # sizes of each hidden layers + sizes_pol = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + # select some part of the action + # more information at https://grid2op.readthedocs.io/en/latest/converter.html#grid2op.Converter.IdToAct.init_converter + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False + } + # define the name of the model + nm_ = "AnneOnymous" + try: + train(env, + name=nm_, + iterations=10000, + save_path="/WHERE/I/SAVED/THE/MODEL", + load_path=None, + logs_dir="/WHERE/I/SAVED/THE/LOGS", + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + finally: + env.close() + + """ + + # Limit gpu usage + try: + physical_devices = tf.config.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except AttributeError: + # issue of https://stackoverflow.com/questions/59266150/attributeerror-module-tensorflow-core-api-v2-config-has-no-attribute-list-p + try: + physical_devices = tf.config.experimental.list_physical_devices('GPU') + if len(physical_devices) > 0: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + except Exception: + warnings.warn(_WARN_GPU_MEMORY) + + if training_param is None: + training_param = TrainingParam() + + # compute the proper size for the converter + kwargs_archi["action_size"] = SACOld.get_action_size(env.action_space, filter_action_fun, kwargs_converters) + + if load_path is not None: + path_model, path_target_model = SACOld_NN.get_path_model(load_path, name) + if verbose: + print("INFO: Reloading a model, the architecture parameters provided will be ignored") + nn_archi = SACOld_NNParam.from_json(os.path.join(path_model, "nn_architecture.json")) + else: + nn_archi = SACOld_NNParam(**kwargs_archi) + + baseline = SACOld(action_space=env.action_space, + 
nn_archi=nn_archi, + name=name, + istraining=True, + verbose=verbose, + **kwargs_converters + ) + + if load_path is not None: + if verbose: + print("INFO: Reloading a model, training parameters will be ignored") + baseline.load(load_path) + training_param = baseline._training_param + + baseline.train(env, + iterations, + save_path=save_path, + logdir=logs_dir, + training_param=training_param) + # as in our example (and in our explanation) we recommend to save the mode regurlarly in the "train" function + # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than + # recommended to save the "baseline" at the end of this function with: + # baseline.save(path_save) + + +if __name__ == "__main__": + # import grid2op + import numpy as np + from grid2op.Parameters import Parameters + from grid2op import make + from grid2op.Reward import L2RPNReward + import re + try: + from lightsim2grid.LightSimBackend import LightSimBackend + backend = LightSimBackend() + except: + from grid2op.Backend import PandaPowerBackend + backend = PandaPowerBackend() + + args = cli_train().parse_args() + + # is it highly recommended to modify the reward depening on the algorithm. + # for example here i will push my algorithm to learn that plyaing illegal or ambiguous action is bad + class MyReward(L2RPNReward): + def initialize(self, env): + self.reward_min = 0.0 + self.reward_max = 1.0 + + def __call__(self, action, env, has_error, is_done, is_illegal, is_ambiguous): + if has_error or is_illegal or is_ambiguous: + # previous action was bad + res = self.reward_min + elif is_done: + # really strong reward if an episode is over without game over + res = self.reward_max + else: + res = super().__call__(action, env, has_error, is_done, is_illegal, is_ambiguous) + res /= env.n_line + if not np.isfinite(res): + res = self.reward_min + return res + + # Use custom params + + # Create grid2op game environement + env_init = None + try: + from grid2op.Chronics import MultifolderWithCache + except: + from grid2op.Chronics import MultiFolder + MultifolderWithCache = MultiFolder + + game_param = Parameters() + game_param.NB_TIMESTEP_COOLDOWN_SUB = 2 + game_param.NB_TIMESTEP_COOLDOWN_LINE = 2 + env = make(args.env_name, + param=game_param, + reward_class=MyReward, + backend=backend, + chronics_class=MultifolderWithCache + ) + # env.chronics_handler.set_max_iter(7*288) + try: + env.chronics_handler.real_data.set_filter(lambda x: re.match(".*((03)|(72)|(57))$", x) is not None) + env.chronics_handler.real_data.reset() + except RuntimeError as exc_: + raise exc_ + except AttributeError as exc_: + # not available in all grid2op version + pass + # env.chronics_handler.real_data. + env_init = env + if args.nb_env > 1: + from l2rpn_baselines.utils import make_multi_env + env = make_multi_env(env_init=env_init, nb_env=int(args.nb_env)) + + tp = TrainingParam() + + # NN training + tp.lr = 1e-4 + tp.lr_decay_steps = 30000 + tp.minibatch_size = 256 + tp.update_freq = 128 + + # limit the number of time steps played per scenarios + tp.step_increase_nb_iter = 100 # None to deactivate it + tp.min_iter = 10 + tp.update_nb_iter = 100 # once 100 scenarios are solved, increase of "step_increase_nb_iter" + + # oversampling hard scenarios + tp.oversampling_rate = 3 + + # experience replay + tp.buffer_size = 1000000 + + # e greedy + tp.min_observation = 10000 + tp.initial_epsilon = 0.4 + tp.final_epsilon = 1./(2*7*288.) 
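+    # NB (illustrative note): grid2op scenarios here use 5-minute steps, i.e. 288 steps per day,
+    # so 1./(2*7*288.) corresponds to roughly one random (exploratory) action every two weeks of play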
+ tp.step_for_final_epsilon = int(1e5) + + # don't start always at the same hour (if not None) otherwise random sampling, see docs + tp.random_sample_datetime_start = None + + # saving, logging etc. + tp.save_model_each = 10000 + tp.update_tensorboard_freq = 256 + + li_attr_obs_X = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q", + "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line", + "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status"] + + # nn architecture + observation_size = SACOld_NNParam.get_obs_size(env_init, li_attr_obs_X) + sizes_q = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + sizes_v = [800, 800] # sizes of each hidden layers + sizes_pol = [800, 800, 800, 494, 494, 494] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + # which actions i keep + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False, + } + nm_ = args.name if args.name is not None else DEFAULT_NAME + try: + train(env, + name=nm_, + iterations=args.num_train_steps, + save_path=args.save_path, + load_path=args.load_path, + logs_dir=args.logs_dir, + training_param=tp, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + finally: + env.close() + if args.nb_env > 1: + env_init.close() diff --git a/l2rpn_baselines/__init__.py b/l2rpn_baselines/__init__.py index 6a9b047..54668e5 100644 --- a/l2rpn_baselines/__init__.py +++ b/l2rpn_baselines/__init__.py @@ -7,6 +7,9 @@ "DeepQSimple", "DuelQSimple", "SAC", + "LeapNetEncoded", + # Backward compatibility + "SACOld", # contribution "PandapowerOPFAgent", "Geirina", diff --git a/l2rpn_baselines/test/test_import.py b/l2rpn_baselines/test/test_import.py index b022062..ba16d6b 100644 --- a/l2rpn_baselines/test/test_import.py +++ b/l2rpn_baselines/test/test_import.py @@ -50,6 +50,16 @@ def load_module(self): return "SAC" +class TestSACOld(TestImport, unittest.TestCase): + def load_module(self): + return "SACOld" + + +class TestLeapNetEnc(TestImport, unittest.TestCase): + def load_module(self): + return "LeapNetEncoded" + + class TestDuelQSimple(TestImport, unittest.TestCase): def load_module(self): return "DuelQSimple" diff --git a/l2rpn_baselines/test/test_train_eval.py b/l2rpn_baselines/test/test_train_eval.py index 9ba77dc..b9f91a5 100644 --- a/l2rpn_baselines/test/test_train_eval.py +++ b/l2rpn_baselines/test/test_train_eval.py @@ -23,10 +23,14 @@ from l2rpn_baselines.DeepQSimple import evaluate as eval_dqn from l2rpn_baselines.DuelQSimple import train as train_d3qs from l2rpn_baselines.DuelQSimple import evaluate as eval_d3qs +from l2rpn_baselines.SACOld import train as train_sacold +from l2rpn_baselines.SACOld import evaluate as eval_sacold from l2rpn_baselines.SAC import train as train_sac from l2rpn_baselines.SAC import evaluate as eval_sac from l2rpn_baselines.DuelQLeapNet import train as train_leap from l2rpn_baselines.DuelQLeapNet import evaluate as eval_leap +from l2rpn_baselines.LeapNetEncoded import train as train_leapenc +from l2rpn_baselines.LeapNetEncoded import evaluate as eval_leapenc from l2rpn_baselines.DoubleDuelingDQN import train as train_d3qn from 
l2rpn_baselines.DoubleDuelingDQN import evaluate as eval_d3qn from l2rpn_baselines.DoubleDuelingDQN import DoubleDuelingDQNConfig as d3qn_cfg @@ -273,6 +277,62 @@ def test_train_eval(self): kwargs_archi=kwargs_archi) baseline_2 = eval_d3qs(env, + name=nm_, + load_path=tmp_dir, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=30, + verbose=False, + save_gif=False) + + +class TestSACOld(unittest.TestCase): + def test_train_eval(self): + tp = TrainingParam() + tp.buffer_size = 100 + tp.minibatch_size = 8 + tp.update_freq = 32 + tp.min_observation = 32 + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + li_attr_obs_X = ["prod_p", "load_p", "rho"] + + # neural network architecture + observation_size = NNParam.get_obs_size(env, li_attr_obs_X) + sizes_q = [100, 50, 10] # sizes of each hidden layers + sizes_v = [100, 100] # sizes of each hidden layers + sizes_pol = [100, 10] # sizes of each hidden layers + kwargs_archi = {'observation_size': observation_size, + 'sizes': sizes_q, + 'activs': ["relu" for _ in range(len(sizes_q))], + "list_attr_obs": li_attr_obs_X, + "sizes_value": sizes_v, + "activs_value": ["relu" for _ in range(len(sizes_v))], + "sizes_policy": sizes_pol, + "activs_policy": ["relu" for _ in range(len(sizes_pol))] + } + + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_bus_vect": True, + "set_topo_vect": False + } + nm_ = "AnneOnymous" + train_sacold(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_dir=tmp_dir, + training_param=tp, + verbose=False, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + + baseline_2 = eval_sacold(env, name=nm_, load_path=tmp_dir, logs_path=tmp_dir, @@ -396,6 +456,78 @@ def test_train_eval(self): save_gif=False) +class TestLeapNetEncoded(unittest.TestCase): + def test_train_eval(self): + tp = TrainingParam() + tp.buffer_size = 100 + tp.minibatch_size = 8 + tp.update_freq = 32 + tp.min_observation = 32 + tmp_dir = tempfile.mkdtemp() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + env = grid2op.make("rte_case5_example", test=True) + kwargs_converters = {"all_actions": None, + "set_line_status": False, + "change_line_status": True, + "change_bus_vect": False, + "set_topo_vect": False, + "redispacth": False + } + + # nn architecture + li_attr_obs_X = ["prod_p", "prod_v", "load_p", "load_q"] + li_attr_obs_input_q = ["time_before_cooldown_line", + "time_before_cooldown_sub", + "actual_dispatch", + "target_dispatch", + "day_of_week", + "hour_of_day", + "minute_of_hour", + "rho"] + li_attr_obs_Tau = ["line_status", "timestep_overflow"] + list_attr_gm_out = ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex", "prod_q", "load_v"] + li_attr_obs_X + + kwargs_archi = {'sizes': [], + 'activs': [], + 'x_dim': -1, + + "list_attr_obs": li_attr_obs_X, + "list_attr_obs_tau": li_attr_obs_Tau, + "list_attr_obs_x": li_attr_obs_X, + "list_attr_obs_input_q": li_attr_obs_input_q, + "list_attr_obs_gm_out": list_attr_gm_out, + + 'dim_topo': env.dim_topo, + + "sizes_enc": (10, 10, 10, 10), + "sizes_main": (50, ), + "sizes_out_gm": (50,), + "sizes_Qnet": (50, 50, ) + } + nm_ = "AnneOnymous" + train_leapenc(env, + name=nm_, + iterations=100, + save_path=tmp_dir, + load_path=None, + logs_dir=tmp_dir, + training_param=tp, + verbose=False, + kwargs_converters=kwargs_converters, + kwargs_archi=kwargs_archi) + + baseline_2 = eval_leapenc(env, + name=nm_, + 
load_path=tmp_dir, + logs_path=tmp_dir, + nb_episode=1, + nb_process=1, + max_steps=30, + verbose=False, + save_gif=False) + + class TestD3QN(unittest.TestCase): def test_train_eval(self): tmp_dir = tempfile.mkdtemp() diff --git a/l2rpn_baselines/utils/BaseDeepQ.py b/l2rpn_baselines/utils/BaseDeepQ.py index a9c54d3..5a1f1cd 100644 --- a/l2rpn_baselines/utils/BaseDeepQ.py +++ b/l2rpn_baselines/utils/BaseDeepQ.py @@ -108,38 +108,60 @@ def construct_q_network(self): """ raise NotImplementedError("Not implemented") - def predict_movement(self, data, epsilon, batch_size=None): + def predict_movement(self, data, epsilon, batch_size=None, training=False): """ Predict movement of game controler where is epsilon probability randomly move.""" if batch_size is None: batch_size = data.shape[0] - rand_val = np.random.random(batch_size) - q_actions = self._model.predict(data, batch_size=batch_size) - - opt_policy = np.argmax(np.abs(q_actions), axis=-1) - opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) - return opt_policy, q_actions[0, opt_policy] + # q_actions = self._model.predict(data, batch_size=batch_size) # q value of each action + q_actions = self._model(data, training=training).numpy() + opt_policy = np.argmax(q_actions, axis=-1) + if epsilon > 0.: + rand_val = np.random.random(batch_size) + opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon))) + return opt_policy, q_actions[np.arange(batch_size), opt_policy], q_actions def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None): - """Trains network to fit given parameters""" + """Trains network to fit given parameters: + Parameters + ---------- + s_batch: + the state vector (before the action is taken) + a_batch: + the action taken + s2_batch: + the state vector (after the action is taken) + d_batch: + says whether or not the episode was over + r_batch: + the reward obtained this step + + see https://towardsdatascience.com/dueling-double-deep-q-learning-using-tensorflow-2-x-7bbbcec06a2a + for the update rules + """ if batch_size is None: batch_size = s_batch.shape[0] # Save the graph just the first time if tf_writer is not None: tf.summary.trace_on() - targets = self._model.predict(s_batch, batch_size=batch_size) + target = self._model(s_batch, training=True).numpy() + fut_action = self._model(s2_batch, training=True).numpy() if tf_writer is not None: with tf_writer.as_default(): tf.summary.trace_export("model-graph", 0) tf.summary.trace_off() - fut_action = self._target_model.predict(s2_batch, batch_size=batch_size) - - targets[:, a_batch.flatten()] = r_batch - targets[d_batch, a_batch[d_batch]] += self._training_param.discount_factor * np.max(fut_action[d_batch], axis=-1) - - loss = self.train_on_batch(self._model, self._optimizer_model, s_batch, targets) + target_next = self._target_model(s2_batch, training=True).numpy() + + idx = np.arange(batch_size) + target[idx, a_batch] = r_batch + # update the value for not done episode + nd_batch = ~d_batch # update with this rule only batch that did not game over + next_a = np.argmax(fut_action, axis=-1) # compute the future action i will take in the next state + fut_Q = target_next[idx, next_a] # get its Q value + target[nd_batch, a_batch[nd_batch]] += self._training_param.discount_factor * fut_Q[nd_batch] + loss = self.train_on_batch(self._model, self._optimizer_model, s_batch, target) return loss def train_on_batch(self, model, optimizer_model, x, 
y_true): @@ -213,14 +235,22 @@ def load_network(self, path, name=None, ext="h5"): if self.verbose: print("Succesfully loaded network.") - def target_train(self): + def target_train(self, tau=None): """ update the target model with the parameters given in the :attr:`BaseDeepQ._training_param`. """ - # nothing has changed from the original implementation - model_weights = self._model.get_weights() - target_model_weights = self._target_model.get_weights() - for i in range(len(model_weights)): - target_model_weights[i] = self._training_param.tau * model_weights[i] + (1 - self._training_param.tau) * \ - target_model_weights[i] - self._target_model.set_weights(target_model_weights) \ No newline at end of file + if tau is None: + tau = self._training_param.tau + tau_inv = 1.0 - tau + + target_params = self._target_model.trainable_variables + source_params = self._model.trainable_variables + for src, dest in zip(source_params, target_params): + # Polyak averaging + var_update = src.value() * tau + var_persist = dest.value() * tau_inv + dest.assign(var_update + var_persist) + + def save_tensorboard(self, current_step): + """function used to save other information to tensorboard""" + pass diff --git a/l2rpn_baselines/utils/DeepQAgent.py b/l2rpn_baselines/utils/DeepQAgent.py index 18637a4..2bcfb25 100644 --- a/l2rpn_baselines/utils/DeepQAgent.py +++ b/l2rpn_baselines/utils/DeepQAgent.py @@ -135,9 +135,6 @@ def __init__(self, self.store_action = store_action self.dict_action = {} self.istraining = istraining - self._actions_per_1000steps = np.zeros((1000, self.action_space.size()), dtype=np.int) - self._illegal_actions_per_1000steps = np.zeros(1000, dtype=np.int) - self._ambiguous_actions_per_1000steps = np.zeros(1000, dtype=np.int) self.epsilon = 1.0 # for tensorbaord @@ -166,9 +163,6 @@ def __init__(self, # this is for the "limit the episode length" depending on your previous success self._total_sucesses = 0 - # update frequency of action types - self._nb_updated_act_tensorboard = None - # neural network architecture self._nn_archi = nn_archi @@ -182,6 +176,23 @@ def __init__(self, else: self.init_obs_extraction(observation_space) + # for the frequency of action type + self.current_ = 0 + self.nb_ = 10 + self._nb_this_time = np.zeros((self.nb_, 6)) + + # + self._vector_size = None + self._actions_per_ksteps = None + self._illegal_actions_per_ksteps = None + self._ambiguous_actions_per_ksteps = None + + def _fill_vectors(self, training_param): + self._vector_size = self.nb_ * training_param.update_tensorboard_freq + self._actions_per_ksteps = np.zeros((self._vector_size, self.action_space.size()), dtype=np.int) + self._illegal_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int) + self._ambiguous_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int) + # grid2op.Agent interface def convert_obs(self, observation): """ @@ -232,7 +243,9 @@ def my_act(self, transformed_observation, reward, done=False): The id the action taken. 
""" - predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation, epsilon=0.0) + predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation, + epsilon=0.0, + training=False) res = int(predict_movement_int) self._store_action_played(res) return res @@ -389,6 +402,7 @@ def train(self, else: training_param = self._training_param self._init_deep_q(self._training_param, env) + self._fill_vectors(self._training_param) self._init_replay_buffer() @@ -410,7 +424,6 @@ def train(self, UPDATE_FREQ = training_param.update_tensorboard_freq # update tensorboard every "UPDATE_FREQ" steps SAVING_NUM = training_param.save_model_each - if hasattr(env, "nb_env"): nb_env = env.nb_env warnings.warn("Training using {} environments".format(nb_env)) @@ -478,8 +491,6 @@ def train(self, self._prev_id = 0 # this is for the "limit the episode length" depending on your previous success self._total_sucesses = 0 - # update the frequency of action types - self._nb_updated_act_tensorboard = 0 with tqdm(total=iterations - training_step, disable=not self.verbose) as pbar: while training_step < iterations: @@ -491,7 +502,7 @@ def train(self, self.epsilon = self._training_param.get_next_epsilon(current_step=training_step) # then we need to predict the next moves. Agents have been adapted to predict a batch of data - pm_i, pq_v, act = self._next_move(initial_state, self.epsilon) + pm_i, pq_v, act = self._next_move(initial_state, self.epsilon, training_step) # todo store the illegal / ambiguous / ... actions reward, done = self._init_local_train_loop() @@ -501,7 +512,6 @@ def train(self, act = act[0] temp_observation_obj, temp_reward, temp_done, info = env.step(act) - if self.__nb_env == 1: # dirty hack to wrap them into list temp_observation_obj = [temp_observation_obj] @@ -530,7 +540,6 @@ def train(self, alive_frames[epoch_num] = np.mean(alive_frame) total_rewards[epoch_num] = np.mean(total_reward) self._store_action_played_train(training_step, pm_i) - self._save_tensorboard(training_step, epoch_num, UPDATE_FREQ, total_rewards, alive_frames) training_step += 1 pbar.update(1) @@ -549,36 +558,40 @@ def _convert_obs_train(self, observations): self._obs_as_vect[i, :] = self.convert_obs(obs).reshape(-1) return self._obs_as_vect + def _create_action_if_not_registered(self, action_int): + """make sure that `action_int` is present in dict_action""" + if action_int not in self.dict_action: + act = self.action_space.all_actions[action_int] + is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn = False, False, False, False, False, False + try: + # feature unavailble in grid2op <= 0.9.2 + is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types() + is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp) + except Exception as exc_: + pass + + self.dict_action[action_int] = [0, act, + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn)] + def _store_action_played(self, action_int): """if activated, this function will store the action taken by the agent.""" if self.store_action: - if action_int not in self.dict_action: - act = self.action_space.all_actions[action_int] - is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn = False, False, False, False, False, False - try: - # feature unavailble in grid2op <= 0.9.2 - is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types() - is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp) - except Exception as exc_: - pass - - 
self.dict_action[action_int] = [0, act, - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn)] - self.dict_action[action_int][0] += 1 - - (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn) = self.dict_action[action_int][2] - if is_inj: - self.nb_injection += 1 - if is_volt: - self.nb_voltage += 1 - if is_topo: - self.nb_topology += 1 - if is_line_status: - self.nb_line += 1 - if is_redisp: - self.nb_redispatching += 1 - if is_dn: - self.nb_do_nothing += 1 + self._create_action_if_not_registered(action_int) + + self.dict_action[action_int][0] += 1 + (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_dn) = self.dict_action[action_int][2] + if is_inj: + self.nb_injection += 1 + if is_volt: + self.nb_voltage += 1 + if is_topo: + self.nb_topology += 1 + if is_line_status: + self.nb_line += 1 + if is_redisp: + self.nb_redispatching += 1 + if is_dn: + self.nb_do_nothing += 1 def _convert_all_act(self, act_as_integer): """this function converts the action given as a list of integer. It ouputs a list of valid grid2op Action""" @@ -611,6 +624,7 @@ def _train_model(self, training_step): self._training_param.tell_step(training_step) if training_step > max(self._training_param.min_observation, self._training_param.minibatch_size) and \ self._training_param.do_train(): + # train the model s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample(self._training_param.minibatch_size) tf_writer = None @@ -630,14 +644,15 @@ def _train_model(self, training_step): def _updage_illegal_ambiguous(self, curr_step, info): """update the conunt of illegal and ambiguous actions""" - self._illegal_actions_per_1000steps[curr_step % 1000] = np.sum([el["is_illegal"] for el in info]) - self._ambiguous_actions_per_1000steps[curr_step % 1000] = np.sum([el["is_ambiguous"] for el in info]) + tmp_ = curr_step % self._vector_size + self._illegal_actions_per_ksteps[tmp_] = np.sum([el["is_illegal"] for el in info]) + self._ambiguous_actions_per_ksteps[tmp_] = np.sum([el["is_ambiguous"] for el in info]) def _store_action_played_train(self, training_step, action_id): """store which action were played, for tensorboard only.""" - which_row = training_step % 1000 - self._actions_per_1000steps[which_row, :] = 0 - self._actions_per_1000steps[which_row, action_id] += 1 + which_row = training_step % self._vector_size + self._actions_per_ksteps[which_row, :] = 0 + self._actions_per_ksteps[which_row, action_id] += 1 def _fast_forward_env(self, env, time=7*24*60/5): """use this functio to skip some time steps when environment is reset.""" @@ -674,7 +689,8 @@ def _reset_env_clean_state(self, env): def _need_reset(self, env, observation_num, epoch_num, done, new_state): """perform the proper reset of the environment""" - if self._training_param.step_increase_nb_iter > 0: + if self._training_param.step_increase_nb_iter is not None and \ + self._training_param.step_increase_nb_iter > 0: self._max_iter_env(min(max(self._training_param.min_iter, self._training_param.max_iter_fun(self._total_sucesses)), self._training_param.max_iter)) # TODO @@ -748,19 +764,27 @@ def _init_replay_buffer(self): def _store_new_state(self, initial_state, predict_movement_int, reward, done, new_state): """store the new state in the replay buffer""" # vectorized version of the previous code - for i_s, pm_i, reward, done, new_state in zip(initial_state, predict_movement_int, reward, done, new_state): + for i_s, pm_i, reward, done, ns in zip(initial_state, predict_movement_int, reward, done, new_state): self.replay_buffer.add(i_s, 
                                    pm_i, reward, done,
-                                   new_state)
+                                   ns)
 
     def _max_iter_env(self, new_max_iter):
         """update the number of maximum iteration allowed."""
         self._max_iter_env_ = new_max_iter
 
-    def _next_move(self, curr_state, epsilon):
-        pm_i, pq_v = self.deep_q.predict_movement(curr_state, epsilon)
+    def _next_move(self, curr_state, epsilon, training_step):
+        # supposes that 0 encodes for do nothing, otherwise it will NOT work (for the observer)
+        pm_i, pq_v, q_actions = self.deep_q.predict_movement(curr_state, epsilon, training=True)
+
+        if self._training_param.min_observe is not None and \
+                training_step < self._training_param.min_observe:
+            # action is replaced by do nothing due to the "observe only" specification
+            pm_i[:] = 0
+            pq_v[:] = q_actions[:, 0]
+        # TODO implement the "max XXX random action per scenarios"
         act = self._convert_all_act(pm_i)
         return pm_i, pq_v, act
@@ -839,12 +863,12 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
             mean_reward_100 = mean_reward
             mean_alive_100 = mean_alive
 
-            tmp = self._actions_per_1000steps > 0
+            tmp = self._actions_per_ksteps > 0
             tmp = tmp.sum(axis=0)
-            nb_action_taken_last_1000_step = np.sum(tmp > 0)
+            nb_action_taken_last_kstep = np.sum(tmp > 0)
 
-            nb_illegal_act = np.sum(self._illegal_actions_per_1000steps)
-            nb_ambiguous_act = np.sum(self._ambiguous_actions_per_1000steps)
+            nb_illegal_act = np.sum(self._illegal_actions_per_ksteps)
+            nb_ambiguous_act = np.sum(self._ambiguous_actions_per_ksteps)
 
             if epoch_num >= 100:
                 mean_reward_100 = np.nanmean(epoch_rewards[(epoch_num-100):epoch_num])
@@ -861,70 +885,119 @@ def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_a
                 # show first the Mean reward and mine time alive (hence the upper case)
                 tf.summary.scalar("Mean_alive_30", mean_alive_30, step_tb,
-                                  description="Mean reward over the last 30 epochs")
+                                  description="Average number of steps (per episode) made over the last 30 "
+                                              "completed episodes")
                 tf.summary.scalar("Mean_reward_30", mean_reward_30, step_tb,
-                                  description="Mean number of timesteps sucessfully manage the last 30 epochs")
+                                  description="Average (final) reward obtained over the last 30 completed episodes")
 
                 # then it's alpha numerical order, hence the "z_" in front of some information
                 tf.summary.scalar("loss", self._losses[step], step_tb,
-                                  description="last training loss")
+                                  description="Training loss (for the last training batch)")
                 tf.summary.scalar("last_alive", last_alive, step_tb,
-                                  description="last number of timestep during which the agent stayed alive")
+                                  description="Final number of steps for the last complete episode")
                 tf.summary.scalar("last_reward", last_reward, step_tb,
-                                  description="last reward get by the agent")
+                                  description="Final reward over the last complete episode")
 
-                tf.summary.scalar("mean_reward", mean_reward, step_tb)
-                tf.summary.scalar("mean_alive", mean_alive, step_tb)
+                tf.summary.scalar("mean_reward", mean_reward, step_tb,
+                                  description="Average reward over all the episodes played")
+                tf.summary.scalar("mean_alive", mean_alive, step_tb,
+                                  description="Average time alive over all the episodes played")
 
                 tf.summary.scalar("mean_reward_100", mean_reward_100, step_tb,
-                                  description="Mean reward over the last 100 epochs")
+                                  description="Average (final) reward obtained over the last 100 "
+                                              "completed episodes")
                 tf.summary.scalar("mean_alive_100", mean_alive_100, step_tb,
-                                  description="Mean number of timesteps sucessfully manage the last 100 epochs")
+                                  description="Average number of steps (per episode) made over the last 
100 completed episodes") - tf.summary.scalar("nb_differentaction_taken_1000", nb_action_taken_last_1000_step, step_tb, - description="Number of different actions played the past 1000 steps") + tf.summary.scalar("nb_different_action_taken", nb_action_taken_last_kstep, step_tb, + description="Number of different actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_illegal_act", nb_illegal_act, step_tb, - description="Number of illegal actions played the past 1000 steps") + description="Number of illegal actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_ambiguous_act", nb_ambiguous_act, step_tb, - description="Number of ambiguous actions played the past 1000 steps") + description="Number of ambiguous actions played the last " + "{} steps".format(self.nb_ * UPDATE_FREQ)) tf.summary.scalar("nb_total_success", self._total_sucesses, step_tb, - description="Number of times I reach the end of scenario (no game over)") + description="Number of times the episode was completed entirely " + "(no game over)") tf.summary.scalar("z_lr", self._train_lr, step_tb, - description="current learning rate") + description="Current learning rate") tf.summary.scalar("z_epsilon", self.epsilon, step_tb, - description="current epsilon (of the epsilon greedy)") + description="Current epsilon (from the epsilon greedy)") tf.summary.scalar("z_max_iter", self._max_iter_env_, step_tb, - description="maximum number of time steps before deciding a scenario is over (=win)") + description="Maximum number of time steps before deciding a scenario " + "is over (=win)") tf.summary.scalar("z_total_episode", epoch_num, step_tb, - description="total number of episode played (~number of \"reset\")") + description="Total number of episode played (number of \"reset\")") + + self.deep_q.save_tensorboard(step_tb) if self.store_action: - nb_ = 10 # reset the frequencies every nb_ saving - self._nb_updated_act_tensorboard += UPDATE_FREQ - tf.summary.scalar("zz_freq_inj", self.nb_injection / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("zz_freq_voltage", self.nb_voltage / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_topo", self.nb_topology / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_line_status", self.nb_line / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_redisp", self.nb_redispatching / self._nb_updated_act_tensorboard, step_tb) - tf.summary.scalar("z_freq_do_nothing", self.nb_do_nothing / self._nb_updated_act_tensorboard, step_tb) - if step % (nb_ * UPDATE_FREQ) == 0: - self.nb_injection = 0 - self.nb_voltage = 0 - self.nb_topology = 0 - self.nb_line = 0 - self.nb_redispatching = 0 - self.nb_do_nothing = 0 - self._nb_updated_act_tensorboard = 0 + self._store_frequency_action_type(UPDATE_FREQ, step_tb) if self._time_step_lived is not None: tf.summary.histogram( "timestep_lived", self._time_step_lived, step=step_tb, buckets=None, - description="number of time steps lived for all scenarios" + description="Number of time steps lived for all scenarios" ) if self._nb_chosen is not None: tf.summary.histogram( "nb_chosen", self._nb_chosen, step=step_tb, buckets=None, - description="number of times this scenarios has been played" + description="Number of times this scenarios has been played" ) + + def _store_frequency_action_type(self, UPDATE_FREQ, step_tb): + self.current_ += 1 + self.current_ %= self.nb_ + nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_dn = 
+    def _store_frequency_action_type(self, UPDATE_FREQ, step_tb):
+        self.current_ += 1
+        self.current_ %= self.nb_
+        nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_dn = self._nb_this_time[self.current_, :]
+        self._nb_this_time[self.current_, :] = [self.nb_injection, self.nb_voltage,
+                                                self.nb_topology, self.nb_line,
+                                                self.nb_redispatching, self.nb_do_nothing]
+
+        curr_inj = self.nb_injection - nb_inj
+        curr_volt = self.nb_voltage - nb_volt
+        curr_topo = self.nb_topology - nb_topo
+        curr_line = self.nb_line - nb_line
+        curr_redisp = self.nb_redispatching - nb_redisp
+        curr_dn = self.nb_do_nothing - nb_dn
+
+        total_act_num = curr_inj + curr_volt + curr_topo + curr_line + curr_redisp + curr_dn
+        tf.summary.scalar("zz_freq_inj",
+                          curr_inj / total_act_num,
+                          step_tb,
+                          description="Frequency of \"injection\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("zz_freq_voltage",
+                          curr_volt / total_act_num,
+                          step_tb,
+                          description="Frequency of \"voltage\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_topo",
+                          curr_topo / total_act_num,
+                          step_tb,
+                          description="Frequency of \"topo\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_line_status",
+                          curr_line / total_act_num,
+                          step_tb,
+                          description="Frequency of \"line status\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_redisp",
+                          curr_redisp / total_act_num,
+                          step_tb,
+                          description="Frequency of \"redispatching\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
+        tf.summary.scalar("z_freq_do_nothing",
+                          curr_dn / total_act_num,
+                          step_tb,
+                          description="Frequency of \"do nothing\" actions "
+                                      "played over the last {} actions"
+                                      "".format(self.nb_ * UPDATE_FREQ))
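The ``_store_frequency_action_type`` helper above computes the action-type frequencies over a sliding window instead of periodically resetting the counters: a ring buffer keeps ``nb_`` snapshots of the cumulative per-type counters, and subtracting the snapshot taken ``nb_`` calls earlier yields the counts for the last ``nb_ * UPDATE_FREQ`` steps. A self-contained sketch of that idea (the names below are illustrative, not the agent's actual attributes):

.. code-block:: python

    import numpy as np

    class RollingActionCounter:
        """Sliding-window counts obtained by differencing cumulative counters."""
        def __init__(self, nb_=10, n_types=6):
            self.nb_ = nb_              # number of snapshots kept
            self.current_ = 0           # position in the ring buffer
            self._snapshots = np.zeros((nb_, n_types))

        def update(self, cumulative_counts):
            # advance the ring buffer, read the snapshot written nb_ calls ago,
            # then overwrite that slot with the current cumulative totals
            self.current_ = (self.current_ + 1) % self.nb_
            previous = self._snapshots[self.current_, :].copy()
            self._snapshots[self.current_, :] = cumulative_counts
            # the difference is the per-type count over the last window
            return np.asarray(cumulative_counts) - previous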
diff --git a/l2rpn_baselines/utils/NNParam.py b/l2rpn_baselines/utils/NNParam.py
index f6df6c1..8294d9a 100644
--- a/l2rpn_baselines/utils/NNParam.py
+++ b/l2rpn_baselines/utils/NNParam.py
@@ -7,6 +7,10 @@
 # This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
 
 import os
 import json
+import numpy as np
+from collections.abc import Iterable
+
+import grid2op
 
 from l2rpn_baselines.utils.BaseDeepQ import BaseDeepQ
@@ -115,13 +119,38 @@ def to_dict(self):
         for attr_nm in self._list_float:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [float(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, float)
         for attr_nm in self._list_int:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [int(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, int)
         for attr_nm in self._list_str:
             tmp = getattr(self, attr_nm)
-            res[attr_nm] = [str(el) for el in tmp]
+            res[attr_nm] = self._convert_list_to_json(tmp, str)
+        return res
+
+    @classmethod
+    def _convert_list_to_json(cls, obj, type_):
+        if isinstance(obj, type_):
+            res = obj
+        elif isinstance(obj, np.ndarray):
+            if len(obj.shape) == 1:
+                res = [type_(el) for el in obj]
+            else:
+                res = [cls._convert_list_to_json(el, type_) for el in obj]
+        elif isinstance(obj, Iterable):
+            res = [cls._convert_list_to_json(el, type_) for el in obj]
+        else:
+            res = type_(obj)
+        return res
+
+    @classmethod
+    def _attr_from_json(cls, json, type_):
+        if isinstance(json, type_):
+            res = json
+        elif isinstance(json, list):
+            res = [cls._convert_list_to_json(obj=el, type_=type_) for el in json]
+        else:
+            res = type_(json)
         return res
 
     @classmethod
@@ -155,13 +184,13 @@ def from_dict(cls, tmp):
         for attr_nm in cls._list_float:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [float(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], float)
         for attr_nm in cls._list_int:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [int(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], int)
         for attr_nm in cls._list_str:
             if attr_nm in tmp:
-                cls_as_dict[attr_nm] = [str(el) for el in tmp[attr_nm]]
+                cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], str)
         res = cls(**cls_as_dict)
         return res
@@ -189,3 +218,60 @@ def save_as_json(self, path, name=None):
         path_out = os.path.join(path, name)
         with open(path_out, "w", encoding="utf-8") as f:
             json.dump(res, fp=f, indent=4, sort_keys=True)
+
+    def center_reduce(self, env):
+        """currently not implemented for this class, "coming soon" as we might say"""
+        # TODO see TestLeapNet for this feature
+        self._center_reduce_vect(env.get_obs(), "x")
+
+    def _center_reduce_vect(self, obs, nn_part):
+        """
+        compute the xxxx_adds and xxxx_mults for one part of the neural network called nn_part,
+        depending on what attribute of the observation is extracted
+        """
+        if not isinstance(obs, grid2op.Observation.BaseObservation):
+            # in multi processing i receive a set of observation there so i might need
+            # to extract only the first one
+            obs = obs[0]
+
+        li_attr_obs = getattr(self, "list_attr_obs_{}".format(nn_part))
+        adds = []
+        mults = []
+        for attr_nm in li_attr_obs:
+            if attr_nm in ["prod_p"]:
+                add_tmp = np.array([-0.5*(pmax + pmin) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+                mult_tmp = np.array([1./max((pmax - pmin), 0.) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+            elif attr_nm in ["prod_q"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1./max(abs(val), 1.0) for val in obs.prod_q])
+            elif attr_nm in ["load_p", "load_q"]:
+                add_tmp = np.array([-val for val in getattr(obs, attr_nm)])
+                mult_tmp = 0.5
+            elif attr_nm in ["load_v", "prod_v", "v_or", "v_ex"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1. / val for val in getattr(obs, attr_nm)])
+            elif attr_nm == "hour_of_day":
+                add_tmp = -12.
+                mult_tmp = 1.0/12
+            elif attr_nm == "minute_of_hour":
+                add_tmp = -30.
+                mult_tmp = 1.0/30
+            elif attr_nm == "day_of_week":
+                add_tmp = -4.
+                mult_tmp = 1.0/4
+            elif attr_nm == "day":
+                add_tmp = -15.
+                mult_tmp = 1.0/15.
+            elif attr_nm in ["target_dispatch", "actual_dispatch"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1./(pmax - pmin) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
+            elif attr_nm in ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex"]:
+                add_tmp = 0.
+                mult_tmp = np.array([1.0 / max(val, 1.0) for val in getattr(obs, attr_nm)])
+            else:
+                add_tmp = 0.
+                mult_tmp = 1.0
+            mults.append(mult_tmp)
+            adds.append(add_tmp)
+        setattr(self, "{}_adds".format(nn_part), adds)
+        setattr(self, "{}_mults".format(nn_part), mults)
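With the ``_convert_list_to_json`` / ``_attr_from_json`` helpers above, ``NNParam`` attributes that hold numpy arrays (including 2-d ones) can be written by ``save_as_json`` and read back by ``from_dict``: arrays are recursively converted to plain Python lists before ``json.dump`` is called. A small sketch of the behaviour, assuming ``NNParam`` can be imported from ``l2rpn_baselines.utils``:

.. code-block:: python

    import json
    import numpy as np
    from l2rpn_baselines.utils import NNParam

    raw = np.array([[1., 2.], [3., 4.]])                    # a 2-d array is not json serializable as is
    as_plain = NNParam._convert_list_to_json(raw, float)    # -> [[1.0, 2.0], [3.0, 4.0]]
    json.dumps(as_plain)                                    # plain nested lists: json.dump now works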
diff --git a/l2rpn_baselines/utils/ReplayBuffer.py b/l2rpn_baselines/utils/ReplayBuffer.py
index 68164b4..f88864a 100644
--- a/l2rpn_baselines/utils/ReplayBuffer.py
+++ b/l2rpn_baselines/utils/ReplayBuffer.py
@@ -12,6 +12,7 @@
 from collections import deque
 import numpy as np
 import random
+import copy
 
 import pdb
 
@@ -34,6 +35,7 @@ def add(self, s, a, r, d, s2):
             raise RuntimeError("Infinite value somwhere in at least one of the state")
 
         experience = (s, a, r, d, s2)
+        experience = copy.deepcopy(experience)
         if self.count < self.buffer_size:
             self.buffer.append(experience)
             self.count += 1
diff --git a/l2rpn_baselines/utils/TrainingParam.py b/l2rpn_baselines/utils/TrainingParam.py
index 1689dec..551ebea 100644
--- a/l2rpn_baselines/utils/TrainingParam.py
+++ b/l2rpn_baselines/utils/TrainingParam.py
@@ -114,11 +114,11 @@ class TrainingParam(object):
     _tol_float_equal = float(1e-8)
 
     _int_attr = ["buffer_size", "minibatch_size", "step_for_final_epsilon",
-                 "min_observation", "last_step", "num_frames", "update_freq",
+                 "min_observation", "last_step", "num_frames", "update_freq", "min_iter",
                  "max_iter", "update_tensorboard_freq", "save_model_each", "_update_nb_iter",
-                 "step_increase_nb_iter"]
+                 "step_increase_nb_iter", "min_observe"]
     _float_attr = ["_final_epsilon", "_initial_epsilon", "lr", "lr_decay_steps", "lr_decay_rate",
-                   "discount_factor", "tau", "oversampling_rate",
+                   "discount_factor", "tau", "oversampling_rate", "max_global_norm_grad",
                    "max_value_grad", "max_loss"]
 
     def __init__(self,
@@ -145,7 +145,11 @@
                  oversampling_rate=None,
                  max_global_norm_grad=None,
                  max_value_grad=None,
-                 max_loss=None
+                 max_loss=None,
+
+                 # observer: let the neural network "observe" for a given amount of time
+                 # all actions are replaced by a do nothing
+                 min_observe=None,
                  ):
         self.random_sample_datetime_start = random_sample_datetime_start
@@ -165,6 +169,9 @@
         self.max_value_grad = max_value_grad
         self.max_loss = max_loss
+
+        # observer
+        self.min_observe = min_observe
+
         self.last_step = int(0)
         self.num_frames = int(num_frames)
         self.discount_factor = float(discount_factor)
@@ -267,9 +274,9 @@ def to_dict(self):
 
     @staticmethod
     def from_dict(tmp):
-        """initialize this instance from a dictionnary"""
+        """initialize this instance from a dictionary"""
         if not isinstance(tmp, dict):
-            raise RuntimeError("TrainingParam from dict must be called with a dictionnary, and not {}".format(tmp))
+            raise RuntimeError("TrainingParam from dict must be called with a dictionary, and not {}".format(tmp))
         res = TrainingParam()
         for attr_nm in TrainingParam._int_attr:
             if attr_nm in tmp: