From c74f5402b2d4ac4596e0dceb7bcccfdf0c0b0edd Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 24 Aug 2023 16:25:27 +0800 Subject: [PATCH] feat: support multi-agent multi-goal --- safepo/multi_agent/happo.py | 1 - safepo/multi_agent/macpo.py | 10 +++- safepo/multi_agent/mappo.py | 17 +++--- safepo/single_agent/benchmark.py | 2 +- safepo/single_agent/cpo.py | 97 ++++++++++++++++-------------- safepo/single_agent/cppo_pid.py | 13 ++-- safepo/single_agent/cup.py | 13 ++-- safepo/single_agent/focops.py | 17 ++++-- safepo/single_agent/natural_pg.py | 97 ++++++++++++++++-------------- safepo/single_agent/pcpo.py | 97 ++++++++++++++++-------------- safepo/single_agent/pg.py | 13 ++-- safepo/single_agent/ppo.py | 13 ++-- safepo/single_agent/ppo_lag.py | 13 ++-- safepo/single_agent/rcpo.py | 97 ++++++++++++++++-------------- safepo/single_agent/trpo.py | 97 ++++++++++++++++-------------- safepo/single_agent/trpo_lag.py | 99 ++++++++++++++++--------------- safepo/utils/config.py | 2 +- 17 files changed, 379 insertions(+), 319 deletions(-) diff --git a/safepo/multi_agent/happo.py b/safepo/multi_agent/happo.py index f471046..52327e4 100644 --- a/safepo/multi_agent/happo.py +++ b/safepo/multi_agent/happo.py @@ -467,7 +467,6 @@ def eval(self, eval_episodes=1): one_episode_costs = torch.zeros(1, self.config["n_eval_rollout_threads"], device=self.config["device"]) eval_obs, _, _ = self.eval_envs.reset() - # eval_obs = torch.as_tensor(eval_obs, dtype=torch.float32, device=self.config["device"]) eval_rnn_states = torch.zeros(self.config["n_eval_rollout_threads"], self.num_agents, self.config["recurrent_N"], self.config["hidden_size"], device=self.config["device"]) diff --git a/safepo/multi_agent/macpo.py b/safepo/multi_agent/macpo.py index 2bbb501..52793f3 100644 --- a/safepo/multi_agent/macpo.py +++ b/safepo/multi_agent/macpo.py @@ -26,12 +26,12 @@ import sys import time -from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env +from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env, make_ma_multi_goal_env from safepo.common.popart import PopArt from safepo.common.model import MultiAgentActor as Actor, MultiAgentCritic as Critic from safepo.common.buffer import SeparatedReplayBuffer from safepo.common.logger import EpochLogger -from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map +from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map, multi_agent_goal_tasks def check(input): @@ -806,6 +806,12 @@ def train(args, cfg_train): cfg_train["n_rollout_threads"] = env.num_envs cfg_train["n_eval_rollout_threads"] = env.num_envs eval_env = env + elif args.task in multi_agent_goal_tasks: + env = make_ma_multi_goal_env(task=args.task, seed=args.seed, cfg_train=cfg_train) + cfg_eval = copy.deepcopy(cfg_train) + cfg_eval["seed"] = args.seed + 10000 + cfg_eval["n_rollout_threads"] = cfg_eval["n_eval_rollout_threads"] + eval_env = make_ma_multi_goal_env(task=args.task, seed=args.seed + 10000, cfg_train=cfg_eval) else: raise NotImplementedError diff --git a/safepo/multi_agent/mappo.py b/safepo/multi_agent/mappo.py index b26dd71..8eef938 100644 --- a/safepo/multi_agent/mappo.py +++ b/safepo/multi_agent/mappo.py @@ -26,12 +26,12 @@ import sys import time -from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env +from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env, make_ma_multi_goal_env from 
safepo.common.popart import PopArt from safepo.common.model import MultiAgentActor as Actor, MultiAgentCritic as Critic from safepo.common.buffer import SeparatedReplayBuffer from safepo.common.logger import EpochLogger -from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map +from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map, multi_agent_goal_tasks def check(input): @@ -152,7 +152,7 @@ def ppo_update(self, sample): (policy_loss - dist_entropy * self.config["entropy_coef"]).backward() actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"]) self.policy.actor_optimizer.step() - + value_loss = self.cal_value_loss(values, value_preds_batch, return_batch, active_masks_batch) self.policy.critic_optimizer.zero_grad() (value_loss * self.config["value_loss_coef"]).backward() @@ -162,17 +162,13 @@ def ppo_update(self, sample): return value_loss, critic_grad_norm, policy_loss, dist_entropy, actor_grad_norm, imp_weights - def train(self, buffer, logger): advantages = buffer.returns[:-1] - self.value_normalizer.denormalize(buffer.value_preds[:-1]) advantages_copy = advantages.clone() - # advantages_copy[buffer.active_masks[:-1] == 0.0] = torch.nan mean_advantages = torch.mean(advantages_copy) - # std_advantages = torch.std(advantages_copy) std_advantages = torch.std(advantages_copy) advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) - for _ in range(self.config["ppo_epoch"]): data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"]) @@ -488,6 +484,7 @@ def eval(self, eval_episodes=1): zeros = torch.zeros(eval_actions_collector[-1].shape[0], 1) eval_actions_collector[-1]=torch.cat((eval_actions_collector[-1], zeros), dim=1) + eval_obs, _, eval_rewards, eval_costs, eval_dones, _, _ = self.eval_envs.step( eval_actions_collector ) @@ -553,6 +550,12 @@ def train(args, cfg_train): cfg_train["n_rollout_threads"] = env.num_envs cfg_train["n_eval_rollout_threads"] = env.num_envs eval_env = env + elif args.task in multi_agent_goal_tasks: + env = make_ma_multi_goal_env(task=args.task, seed=args.seed, cfg_train=cfg_train) + cfg_eval = copy.deepcopy(cfg_train) + cfg_eval["seed"] = args.seed + 10000 + cfg_eval["n_rollout_threads"] = cfg_eval["n_eval_rollout_threads"] + eval_env = make_ma_multi_goal_env(task=args.task, seed=args.seed + 10000, cfg_train=cfg_eval) else: raise NotImplementedError diff --git a/safepo/single_agent/benchmark.py b/safepo/single_agent/benchmark.py index c7e983a..2470fb9 100644 --- a/safepo/single_agent/benchmark.py +++ b/safepo/single_agent/benchmark.py @@ -51,7 +51,7 @@ def parse_args(): "--start-seed", type=int, default=0, help="the number of the starting seed" ) parser.add_argument( - "--workers", type=int, default=16, help="the number of workers to run benchmark experimenets", + "--workers", type=int, default=8, help="the number of workers to run benchmark experimenets", ) parser.add_argument( "--experiment", type=str, default="benchmark_new_8_24", help="name of the experiment" diff --git a/safepo/single_agent/cpo.py b/safepo/single_agent/cpo.py index ca1a5d1..1a55204 100644 --- a/safepo/single_agent/cpo.py +++ b/safepo/single_agent/cpo.py @@ -40,6 +40,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 
64], 'gamma': 0.99, @@ -59,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -182,8 +184,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -544,14 +547,16 @@ def f_b(lam: torch.Tensor) -> torch.Tensor: ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -564,45 +569,45 @@ def f_b(lam: torch.Tensor) -> torch.Tensor: } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - logger.log_tabular("Misc/AcceptanceStep") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + 
logger.log_tabular("Train/KL") + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + logger.log_tabular("Misc/AcceptanceStep") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/single_agent/cppo_pid.py b/safepo/single_agent/cppo_pid.py index 3c315c6..2165332 100644 --- a/safepo/single_agent/cppo_pid.py +++ b/safepo/single_agent/cppo_pid.py @@ -61,6 +61,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -91,8 +92,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -304,10 +306,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 distribution = policy.actor(obs_b) log_prob = distribution.log_prob(act_b).sum(dim=-1) ratio = torch.exp(log_prob - log_prob_b) diff --git a/safepo/single_agent/cup.py b/safepo/single_agent/cup.py index 6274dfe..63dbd5f 100644 --- a/safepo/single_agent/cup.py +++ b/safepo/single_agent/cup.py @@ -62,6 +62,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -92,8 +93,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -307,10 +309,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = 
nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 distribution = policy.actor(obs_b) log_prob = distribution.log_prob(act_b).sum(dim=-1) ratio = torch.exp(log_prob - log_prob_b) diff --git a/safepo/single_agent/focops.py b/safepo/single_agent/focops.py index e055540..d22e177 100644 --- a/safepo/single_agent/focops.py +++ b/safepo/single_agent/focops.py @@ -42,6 +42,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -61,6 +62,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -91,8 +93,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -312,10 +315,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 old_distribution_b = Normal(loc=old_mean_b, scale=old_std_b) distribution = policy.actor(obs_b) @@ -326,7 +330,8 @@ def main(args, cfg_env=None): ).sum(-1, keepdim=True) loss_pi = (temp_kl - (1 / 1.5) * ratio * adv_b) * ( temp_kl.detach() <= 0.02 - ).type(torch.float32).mean() + ).type(torch.float32) + loss_pi = loss_pi.mean() actor_optimizer.zero_grad() total_loss = loss_pi + 2*loss_r + loss_c \ if config.get("use_value_coefficient", False) \ diff --git a/safepo/single_agent/natural_pg.py b/safepo/single_agent/natural_pg.py index 97b6a57..a29c23e 100644 --- a/safepo/single_agent/natural_pg.py +++ b/safepo/single_agent/natural_pg.py @@ -40,6 +40,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -59,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -182,8 +184,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = 
ActorVCritic( obs_dim=obs_space.shape[0], @@ -343,6 +346,8 @@ def main(args, cfg_env=None): eval_end_time = time.time() + # update lagrange multiplier + ep_costs = logger.get_stats("Metrics/EpCost") # update policy data = buffer.get() @@ -409,14 +414,16 @@ def main(args, cfg_env=None): ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -429,44 +436,44 @@ def main(args, cfg_env=None): } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + logger.log_tabular("Train/KL") + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", 
update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/single_agent/pcpo.py b/safepo/single_agent/pcpo.py index 42e0655..9f8d696 100644 --- a/safepo/single_agent/pcpo.py +++ b/safepo/single_agent/pcpo.py @@ -40,6 +40,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -59,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -182,8 +184,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -479,14 +482,16 @@ def main(args, cfg_env=None): ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -499,45 +504,45 @@ def main(args, cfg_env=None): } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", 
data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - logger.log_tabular("Misc/AcceptanceStep") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + logger.log_tabular("Train/KL") + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + logger.log_tabular("Misc/AcceptanceStep") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/single_agent/pg.py b/safepo/single_agent/pg.py index 02258b6..131023b 100644 --- a/safepo/single_agent/pg.py +++ b/safepo/single_agent/pg.py @@ -60,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -90,8 +91,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -296,10 +298,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 distribution = policy.actor(obs_b) log_prob = distribution.log_prob(act_b).sum(dim=-1) ratio = torch.exp(log_prob - log_prob_b) diff --git a/safepo/single_agent/ppo.py 
b/safepo/single_agent/ppo.py index f121387..331bfd5 100644 --- a/safepo/single_agent/ppo.py +++ b/safepo/single_agent/ppo.py @@ -60,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -90,8 +91,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -296,10 +298,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 distribution = policy.actor(obs_b) log_prob = distribution.log_prob(act_b).sum(dim=-1) ratio = torch.exp(log_prob - log_prob_b) diff --git a/safepo/single_agent/ppo_lag.py b/safepo/single_agent/ppo_lag.py index da961d2..d73d96e 100644 --- a/safepo/single_agent/ppo_lag.py +++ b/safepo/single_agent/ppo_lag.py @@ -61,6 +61,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } def main(args, cfg_env=None): @@ -91,8 +92,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -305,10 +307,11 @@ def main(args, cfg_env=None): loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 distribution = policy.actor(obs_b) log_prob = distribution.log_prob(act_b).sum(dim=-1) ratio = torch.exp(log_prob - log_prob_b) diff --git a/safepo/single_agent/rcpo.py b/safepo/single_agent/rcpo.py index a6f48ad..cd4e2d4 100644 --- a/safepo/single_agent/rcpo.py +++ b/safepo/single_agent/rcpo.py @@ -41,6 +41,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -60,6 +61,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -183,8 +185,9 @@ def main(args, cfg_env=None): # set 
training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -420,14 +423,16 @@ def main(args, cfg_env=None): ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -440,45 +445,45 @@ def main(args, cfg_env=None): } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Train/LagragianMultiplier", lagrange.lagrangian_multiplier) - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + logger.log_tabular("Train/KL") + logger.log_tabular("Train/LagragianMultiplier", lagrange.lagrangian_multiplier) + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + 
logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/single_agent/trpo.py b/safepo/single_agent/trpo.py index 1c7b335..465d343 100644 --- a/safepo/single_agent/trpo.py +++ b/safepo/single_agent/trpo.py @@ -40,6 +40,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -59,6 +60,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -182,8 +184,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -457,14 +460,16 @@ def main(args, cfg_env=None): ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -477,45 +482,45 @@ def main(args, cfg_env=None): } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - 
logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - logger.log_tabular("Misc/AcceptanceStep") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + logger.log_tabular("Train/KL") + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + logger.log_tabular("Misc/AcceptanceStep") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/single_agent/trpo_lag.py b/safepo/single_agent/trpo_lag.py index b7128fe..492fb87 100644 --- a/safepo/single_agent/trpo_lag.py +++ b/safepo/single_agent/trpo_lag.py @@ -41,6 +41,7 @@ from safepo.common.model import ActorVCritic from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params + default_cfg = { 'hidden_sizes': [64, 64], 'gamma': 0.99, @@ -60,6 +61,7 @@ 'use_value_coefficient': True, 'learning_iters': 8, 'max_grad_norm': 1.0, + 'use_critic_norm': False, } @@ -183,8 +185,9 @@ def main(args, cfg_env=None): # set training steps steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch) + total_steps = config.get("total_steps", args.total_steps) local_steps_per_epoch = steps_per_epoch // args.num_envs - epochs = args.total_steps // steps_per_epoch + epochs = total_steps // steps_per_epoch # create the actor-critic module policy = ActorVCritic( obs_dim=obs_space.shape[0], @@ -466,14 +469,16 @@ def main(args, cfg_env=None): ) in dataloader: reward_critic_optimizer.zero_grad() loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b) - cost_critic_optimizer.zero_grad() loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), 
target_value_c_b) - for param in policy.reward_critic.parameters(): - loss_r += param.pow(2).sum() * 0.001 - for param in policy.cost_critic.parameters(): - loss_c += param.pow(2).sum() * 0.001 - total_loss = loss_r + loss_c + if config.get("use_critic_norm", True): + for param in policy.reward_critic.parameters(): + loss_r += param.pow(2).sum() * 0.001 + for param in policy.cost_critic.parameters(): + loss_c += param.pow(2).sum() * 0.001 + total_loss = 2*loss_r + loss_c \ + if config.get("use_value_coefficient", False) \ + else loss_r + loss_c total_loss.backward() clip_grad_norm_(policy.parameters(), config["max_grad_norm"]) reward_critic_optimizer.step() @@ -486,46 +491,46 @@ def main(args, cfg_env=None): } ) update_end_time = time.time() - - # log data - logger.log_tabular("Metrics/EpRet") - logger.log_tabular("Metrics/EpCost") - logger.log_tabular("Metrics/EpLen") - if args.use_eval: - logger.log_tabular("Metrics/EvalEpRet") - logger.log_tabular("Metrics/EvalEpCost") - logger.log_tabular("Metrics/EvalEpLen") - logger.log_tabular("Train/Epoch", epoch + 1) - logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) - logger.log_tabular("Train/KL") - logger.log_tabular("Train/LagragianMultiplier", lagrange.lagrangian_multiplier) - logger.log_tabular("Loss/Loss_reward_critic") - logger.log_tabular("Loss/Loss_cost_critic") - logger.log_tabular("Loss/Loss_actor") - logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) - if args.use_eval: - logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) - logger.log_tabular("Time/Update", update_end_time - eval_end_time) - logger.log_tabular("Time/Total", update_end_time - rollout_start_time) - logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) - logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) - logger.log_tabular("Misc/Alpha") - logger.log_tabular("Misc/FinalStepNorm") - logger.log_tabular("Misc/xHx") - logger.log_tabular("Misc/gradient_norm") - logger.log_tabular("Misc/H_inv_g") - logger.log_tabular("Misc/AcceptanceStep") - - logger.dump_tabular() - if (epoch+1) % 100 == 0 or epoch == 0: - logger.torch_save(itr=epoch) - if args.task not in isaac_gym_map.keys(): - logger.save_state( - state_dict={ - "Normalizer": env.obs_rms, - }, - itr = epoch - ) + if not logger.logged: + # log data + logger.log_tabular("Metrics/EpRet") + logger.log_tabular("Metrics/EpCost") + logger.log_tabular("Metrics/EpLen") + if args.use_eval: + logger.log_tabular("Metrics/EvalEpRet") + logger.log_tabular("Metrics/EvalEpCost") + logger.log_tabular("Metrics/EvalEpLen") + logger.log_tabular("Train/Epoch", epoch + 1) + logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch) + logger.log_tabular("Train/KL") + logger.log_tabular("Train/LagragianMultiplier", lagrange.lagrangian_multiplier) + logger.log_tabular("Loss/Loss_reward_critic") + logger.log_tabular("Loss/Loss_cost_critic") + logger.log_tabular("Loss/Loss_actor") + logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time) + if args.use_eval: + logger.log_tabular("Time/Eval", eval_end_time - eval_start_time) + logger.log_tabular("Time/Update", update_end_time - eval_end_time) + logger.log_tabular("Time/Total", update_end_time - rollout_start_time) + logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item()) + logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item()) + logger.log_tabular("Misc/Alpha") + logger.log_tabular("Misc/FinalStepNorm") + logger.log_tabular("Misc/xHx") + 
logger.log_tabular("Misc/gradient_norm") + logger.log_tabular("Misc/H_inv_g") + logger.log_tabular("Misc/AcceptanceStep") + + logger.dump_tabular() + if (epoch+1) % 100 == 0 or epoch == 0: + logger.torch_save(itr=epoch) + if args.task not in isaac_gym_map.keys(): + logger.save_state( + state_dict={ + "Normalizer": env.obs_rms, + }, + itr = epoch + ) logger.close() diff --git a/safepo/utils/config.py b/safepo/utils/config.py index 783e013..2036b8d 100644 --- a/safepo/utils/config.py +++ b/safepo/utils/config.py @@ -196,7 +196,7 @@ def multi_agent_args(algo): # Define custom parameters custom_parameters = [ {"name": "--use-eval", "type": lambda x: bool(strtobool(x)), "default": False, "help": "Use evaluation environment for testing"}, - {"name": "--task", "type": str, "default": "SafetyAntMultiGoal1-v0", "help": "The task to run"}, + {"name": "--task", "type": str, "default": "Safety2x4AntVelocity-v0", "help": "The task to run"}, {"name": "--agent-conf", "type": str, "default": "2x4", "help": "The agent configuration"}, {"name": "--scenario", "type": str, "default": "Ant", "help": "The scenario"}, {"name": "--experiment", "type": str, "default": "Base", "help": "Experiment name"},