diff --git a/docs/source/usage/train.rst b/docs/source/usage/train.rst
index 1b4ca19..6268862 100644
--- a/docs/source/usage/train.rst
+++ b/docs/source/usage/train.rst
@@ -91,7 +91,7 @@ We provide the detailed description of the command line arguments in the following
 +-------------------+--------------------------------+----------------------------------------------+
 | model-dir         | The model dir                  | ""                                           |
 +-------------------+--------------------------------+----------------------------------------------+
-| safety-bound      | Cost_limit                     | 25.0                                         |
+| cost-limit        | Cost_limit                     | 25.0                                         |
 +-------------------+--------------------------------+----------------------------------------------+
 | device            | The device to run the model on | "cpu"                                        |
 +-------------------+--------------------------------+----------------------------------------------+
diff --git a/safepo/multi_agent/happo.py b/safepo/multi_agent/happo.py
index 52327e4..5561230 100644
--- a/safepo/multi_agent/happo.py
+++ b/safepo/multi_agent/happo.py
@@ -177,7 +177,7 @@ def train(self, buffer, logger):
         std_advantages = torch.std(advantages_copy)
         advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)
 
-        for _ in range(self.config["ppo_epoch"]):
+        for _ in range(self.config["learning_iters"]):
             data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])
 
             for sample in data_generator:
diff --git a/safepo/multi_agent/macpo.py b/safepo/multi_agent/macpo.py
index 52793f3..27a034c 100644
--- a/safepo/multi_agent/macpo.py
+++ b/safepo/multi_agent/macpo.py
@@ -231,7 +231,7 @@ def trpo_update(self, sample):
 
         self.policy.cost_optimizer.step()
 
-        rescale_constraint_val = (aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"])
+        rescale_constraint_val = (aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"])
 
         if rescale_constraint_val == 0:
             rescale_constraint_val = 1e-8
@@ -265,7 +265,7 @@ def trpo_update(self, sample):
         r_coef = (reward_loss_grad * b_step_dir).sum(0, keepdim=True)
         s_coef = (cost_loss_grad * b_step_dir).sum(0, keepdim=True)
 
-        fraction = self.config["line_search_fraction"]
+        fraction = self.config["step_fraction"]
         loss_improve = 0
 
         B_cost_loss_grad_dot = torch.dot(B_cost_loss_grad, B_cost_loss_grad)
@@ -285,7 +285,7 @@ def trpo_update(self, sample):
             s_coef = 1e-8
         positive_Cauchy_value = (
                 q_coef - (r_coef ** 2) / (1e-8 + s_coef))
-        whether_recover_policy_value = 2 * self.config["kl_threshold"] - (
+        whether_recover_policy_value = 2 * self.config["target_kl"] - (
                 rescale_constraint_val ** 2) / (
                 1e-8 + s_coef)
         if rescale_constraint_val < 0 and whether_recover_policy_value < 0:
@@ -301,24 +301,24 @@ def trpo_update(self, sample):
 
         if optim_case in [3, 4]:
             lam = torch.sqrt(
-                (q_coef / (2 * self.config["kl_threshold"])))
+                (q_coef / (2 * self.config["target_kl"])))
             nu = torch.tensor(0)  # v_coef = 0
         elif optim_case in [1, 2]:
             LA, LB = [0, r_coef / rescale_constraint_val], [r_coef / rescale_constraint_val, np.inf]
             LA, LB = (LA, LB) if rescale_constraint_val < 0 else (LB, LA)
             proj = lambda x, L: max(L[0], min(L[1], x))
             lam_a = proj(torch.sqrt(positive_Cauchy_value / whether_recover_policy_value), LA)
-            lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["kl_threshold"]))), LB)
+            lam_b = proj(torch.sqrt(q_coef / (torch.tensor(2 * self.config["target_kl"]))), LB)
 
             f_a = lambda lam: -0.5 * (positive_Cauchy_value / (
                     1e-8 + lam) + whether_recover_policy_value * lam) - r_coef * rescale_constraint_val / (
                     1e-8 + s_coef)
-            f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["kl_threshold"] * lam)
+            f_b = lambda lam: -0.5 * (q_coef / (1e-8 + lam) + 2 * self.config["target_kl"] * lam)
             lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
             nu = max(0, lam * rescale_constraint_val - r_coef) / (1e-8 + s_coef)
         else:
             lam = torch.tensor(0)
-            nu = torch.sqrt(torch.tensor(2 * self.config["kl_threshold"]) / (1e-8 + s_coef))
+            nu = torch.sqrt(torch.tensor(2 * self.config["target_kl"]) / (1e-8 + s_coef))
 
         x_a = (1. / (lam + 1e-8)) * (g_step_dir + nu * b_step_dir)
         x_b = (nu * b_step_dir)
@@ -339,7 +339,7 @@ def trpo_update(self, sample):
 
         flag = False
         fraction_coef = self.config["fraction_coef"]
-        for i in range(self.config["ls_step"]):
+        for i in range(self.config["searching_steps"]):
             x_norm = torch.norm(x)
             if x_norm > 0.5:
                 x = x * 0.5 / x_norm
@@ -367,7 +367,7 @@ def trpo_update(self, sample):
                 available_actions_batch, active_masks_batch, new_actor=self.policy.actor, old_actor=old_actor
             ).mean()
 
-            if ((kl < self.config["kl_threshold"]) and (loss_improve < 0 if optim_case > 1 else True)
+            if ((kl < self.config["target_kl"]) and (loss_improve < 0 if optim_case > 1 else True)
                     and (new_cost_loss.mean() - cost_loss.mean() <= max(-rescale_constraint_val, 0))):
                 flag = True
                 break
diff --git a/safepo/multi_agent/mappo.py b/safepo/multi_agent/mappo.py
index 8eef938..57bdd29 100644
--- a/safepo/multi_agent/mappo.py
+++ b/safepo/multi_agent/mappo.py
@@ -169,7 +169,7 @@ def train(self, buffer, logger):
         std_advantages = torch.std(advantages_copy)
         advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)
 
-        for _ in range(self.config["ppo_epoch"]):
+        for _ in range(self.config["learning_iters"]):
            data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])
 
             for sample in data_generator:
diff --git a/safepo/multi_agent/mappolag.py b/safepo/multi_agent/mappolag.py
index a7a67f7..d349dae 100644
--- a/safepo/multi_agent/mappolag.py
+++ b/safepo/multi_agent/mappolag.py
@@ -177,7 +177,7 @@ def ppo_update(self, sample):
         actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"])
         self.policy.actor_optimizer.step()
 
-        delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["safety_bound"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()
+        delta_lamda_lagr = -((aver_episode_costs.mean() - self.config["cost_limit"]) * (1 - self.config["gamma"]) + (imp_weights * cost_adv_targ)).mean().detach()
 
         R_Relu = torch.nn.ReLU()
         new_lamda_lagr = R_Relu(self.lamda_lagr - (delta_lamda_lagr * self.config["lagrangian_coef_rate"]))
@@ -213,7 +213,7 @@ def train(self, buffer, logger):
         std_cost_adv = torch.std(cost_adv_copy)
         cost_adv = (cost_adv - mean_cost_adv) / (std_cost_adv + 1e-8)
 
-        for _ in range(self.config["ppo_epoch"]):
+        for _ in range(self.config["learning_iters"]):
             data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"], cost_adv=cost_adv)
 
             for sample in data_generator:
diff --git a/safepo/multi_agent/marl_cfg/happo/config.yaml b/safepo/multi_agent/marl_cfg/happo/config.yaml
index 0e1d7c1..7ccb535 100644
--- a/safepo/multi_agent/marl_cfg/happo/config.yaml
+++ b/safepo/multi_agent/marl_cfg/happo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: happo
 experiment_name: check
 seed: 0
 run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
 num_env_steps: 100000000
 episode_length: 75
 n_rollout_threads: 1
 n_eval_rollout_threads: 1
-use_linear_lr_decay: False
 hidden_size: 512
 use_render: False
 recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
 use_valuenorm: True
 use_proper_time_limits: False
 
-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
 accept_ratio: 0.5
 clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
 num_mini_batch: 1
 data_chunk_length:
 value_loss_coef: 1
diff --git a/safepo/multi_agent/marl_cfg/ippo/config.yaml b/safepo/multi_agent/marl_cfg/ippo/config.yaml
deleted file mode 100644
index c729085..0000000
--- a/safepo/multi_agent/marl_cfg/ippo/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-env_name: ippo
-algorithm_name: ippo
-experiment_name: check
-seed: 0
-run_dir: ./runs
-use_centralized_V: False # false in IPPO
-use_obs_instead_of_state: False
-num_env_steps: 100000000
-episode_length: 8
-n_rollout_threads: 80
-n_eval_rollout_threads: 1
-use_linear_lr_decay: False
-hidden_size: 512
-use_render: False
-recurrent_N: 1
-use_single_network: False
-
-save_interval: 1
-use_eval: False
-eval_interval: 25
-log_interval: 25
-eval_episodes: 10000
-
-gamma: 0.96
-gae_lambda: 0.95
-use_gae: True
-use_popart: False # false in IPPO
-use_valuenorm: True
-use_proper_time_limits: False
-
-kl_threshold: 0.016
-ls_step: 10
-accept_ratio: 0.5
-clip_param: 0.2
-ppo_epoch: 5
-num_mini_batch: 1
-data_chunk_length:
-value_loss_coef: 1
-entropy_coef: 0.0
-max_grad_norm: 10
-huber_delta: 10.0
-use_recurrent_policy: False
-use_naive_recurrent_policy: False
-use_max_grad_norm: True
-use_clipped_value_loss: True
-use_huber_loss: True
-use_value_active_masks: False # false in IPPO
-use_policy_active_masks: False # false in IPPO
-
-actor_lr: 5.e-4
-critic_lr: 5.e-4
-opti_eps: 1.e-5
-weight_decay: 0.0
-
-gain: 0.01
-actor_gain: 0.01
-use_orthogonal: True
-
-use_feature_normalization: True
-use_ReLU: True
-stacked_frames: 1
-layer_N: 2
-
-std_x_coef: 1
-std_y_coef: 0.5
-
-mamujoco:
-  num_env_steps: 10000000
-  episode_length: 1000
-  n_rollout_threads: 10
-  n_eval_rollout_threads: 10
-  hidden_size: 64
-  gamma: 0.99
diff --git a/safepo/multi_agent/marl_cfg/macpo/config.yaml b/safepo/multi_agent/marl_cfg/macpo/config.yaml
index 0e3c1f0..cf5ab01 100644
--- a/safepo/multi_agent/marl_cfg/macpo/config.yaml
+++ b/safepo/multi_agent/marl_cfg/macpo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: macpo
 experiment_name: check
 seed: 0
 run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
 num_env_steps: 100000000
 episode_length: 8
 n_rollout_threads: 1
 n_eval_rollout_threads: 1
-use_linear_lr_decay: False
 hidden_size: 512
 use_render: False
 recurrent_N: 1
@@ -20,26 +17,26 @@ eval_interval: 25
 log_interval: 25
 eval_episodes: 10000
 
-safety_bound: 25
+cost_limit: 25
 EPS: 1.e-8
 safety_gamma: 0.09
-line_search_fraction: 0.5
+step_fraction: 0.5
 g_step_dir_coef: 0.1
 b_step_dir_coef: 0.1
-fraction_coef: 0.27
+fraction_coef: 0.1
 
 gamma: 0.96
 gae_lambda: 0.95
 use_gae: True
 use_popart: True
-use_valuenorm: False
+use_valuenorm: True
 use_proper_time_limits: False
 
-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
 accept_ratio: 0.5
 clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
 num_mini_batch: 1
 data_chunk_length:
 value_loss_coef: 1
@@ -72,7 +69,6 @@ std_x_coef: 1
 std_y_coef: 0.5
 
 mamujoco:
-  use_valuenorm: True
   layer_N: 1
   num_env_steps: 10000000
   episode_length: 1000
@@ -81,7 +77,6 @@ mamujoco:
   hidden_size: 128
   gamma: 0.99
   safety_gamma: 0.2
-  fraction_coef: 0.1
-  kl_threshold: 0.01
-  ppo_epoch: 15
+  target_kl: 0.01
+  learning_iters: 15 # Conjugate Gradient Iterations
   entropy_coef: 0.01
\ No newline at end of file
diff --git a/safepo/multi_agent/marl_cfg/mappo/config.yaml b/safepo/multi_agent/marl_cfg/mappo/config.yaml
index b342aeb..239a6ed 100644
--- a/safepo/multi_agent/marl_cfg/mappo/config.yaml
+++ b/safepo/multi_agent/marl_cfg/mappo/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappo
 experiment_name: check
 seed: 0
 run_dir: ./runs
-use_centralized_V: True
-use_obs_instead_of_state: False
 num_env_steps: 100000000
 episode_length: 8
 n_rollout_threads: 80
 n_eval_rollout_threads: 1
-use_linear_lr_decay: False
 hidden_size: 512
 use_render: False
 recurrent_N: 1
@@ -28,11 +25,11 @@ use_popart: True
 use_valuenorm: False
 use_proper_time_limits: False
 
-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
 accept_ratio: 0.5
 clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
 num_mini_batch: 1
 data_chunk_length:
 value_loss_coef: 1
diff --git a/safepo/multi_agent/marl_cfg/mappolag/config.yaml b/safepo/multi_agent/marl_cfg/mappolag/config.yaml
index 452e473..8bd4355 100644
--- a/safepo/multi_agent/marl_cfg/mappolag/config.yaml
+++ b/safepo/multi_agent/marl_cfg/mappolag/config.yaml
@@ -3,13 +3,10 @@ algorithm_name: mappolag
 experiment_name: check
 seed: 0
 run_dir: ./runs/
-use_centralized_V: True
-use_obs_instead_of_state: False
 num_env_steps: 100000000
 episode_length: 8
 n_rollout_threads: 1
 n_eval_rollout_threads: 1
-use_linear_lr_decay: False
 hidden_size: 512
 use_render: False
 recurrent_N: 1
@@ -21,7 +18,7 @@ eval_interval: 25
 log_interval: 25
 eval_episodes: 10000
 
-safety_bound: 25
+cost_limit: 25
 lagrangian_coef_rate: 1.e-5
 lamda_lagr: 0.78
 
@@ -32,11 +29,11 @@ use_popart: True
 use_valuenorm: True
 use_proper_time_limits: False
 
-kl_threshold: 0.016
-ls_step: 10
+target_kl: 0.016
+searching_steps: 10
 accept_ratio: 0.5
 clip_param: 0.2
-ppo_epoch: 5
+learning_iters: 5
 num_mini_batch: 1
 data_chunk_length:
 value_loss_coef: 1
diff --git a/safepo/single_agent/cpo.py b/safepo/single_agent/cpo.py
index 1a55204..8cea569 100644
--- a/safepo/single_agent/cpo.py
+++ b/safepo/single_agent/cpo.py
@@ -40,6 +40,9 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+STEP_FRACTION = 0.8
+CPO_SEARCHING_STEPS = 15
+CONJUGATE_GRADIENT_ITERS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -117,7 +120,6 @@ def set_param_values_to_model(model: torch.nn.Module, vals: torch.Tensor) -> None:
         i += int(size)  # increment array position
     assert i == len(vals), f"Lengths do not match: {i} vs. {len(vals)}"
 
-
 def get_flat_gradients_from(model: torch.nn.Module) -> torch.Tensor:
     grads = []
     for _, param in model.named_parameters():
@@ -127,7 +129,6 @@ def get_flat_gradients_from(model: torch.nn.Module) -> torch.Tensor:
     assert grads, "No gradients were found in model parameters."
     return torch.cat(grads)
 
-
 def fvp(
     params: torch.Tensor,
     policy: ActorVCritic,
@@ -362,7 +363,7 @@ def main(args, cfg_env=None):
         loss_pi_r.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         assert xHx.item() >= 0, "xHx is negative"
@@ -380,7 +381,7 @@ def main(args, cfg_env=None):
         b_grads = get_flat_gradients_from(policy.actor)
         ep_costs = logger.get_stats("Metrics/EpCost") - args.cost_limit
 
-        p = conjugate_gradients(fvp, policy, fvp_obs, b_grads, 15)
+        p = conjugate_gradients(fvp, policy, fvp_obs, b_grads, CONJUGATE_GRADIENT_ITERS)
         q = xHx
         r = grads.dot(p)
         s = b_grads.dot(p)
@@ -466,7 +467,7 @@ def f_b(lam: torch.Tensor) -> torch.Tensor:
         expected_reward_improve = grads.dot(step_direction)
 
         kl = torch.zeros(1)
-        for step in range(15):
+        for step in range(CPO_SEARCHING_STEPS):
             new_theta = theta_old + step_frac * step_direction
             set_param_values_to_model(policy.actor, new_theta)
             acceptance_step = step + 1
@@ -478,7 +479,7 @@ def f_b(lam: torch.Tensor) -> torch.Tensor:
                 ratio = torch.exp(log_prob - data["log_prob"])
                 loss_reward = -(ratio * data["adv_r"]).mean()
             except ValueError:
-                step_frac *= 0.8
+                step_frac *= STEP_FRACTION
                 continue
             temp_distribution = policy.actor(data["obs"])
             log_prob = temp_distribution.log_prob(data["act"]).sum(dim=-1)
@@ -508,7 +509,7 @@ def f_b(lam: torch.Tensor) -> torch.Tensor:
             else:
                 logger.log(f"Accept step at i={step + 1}")
                 break
-            step_frac *= 0.8
+            step_frac *= STEP_FRACTION
         else:
             logger.log("INFO: no suitable step found...")
             step_direction = torch.zeros_like(step_direction)
diff --git a/safepo/single_agent/cup.py b/safepo/single_agent/cup.py
index 63dbd5f..6cfa339 100644
--- a/safepo/single_agent/cup.py
+++ b/safepo/single_agent/cup.py
@@ -42,6 +42,8 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+CUP_LAMBDA = 0.95
+CUP_NU = 0.20
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -131,6 +133,7 @@ def main(args, cfg_env=None):
         cost_limit=args.cost_limit,
         lagrangian_multiplier_init=args.lagrangian_multiplier_init,
         lagrangian_multiplier_lr=args.lagrangian_multiplier_lr,
+        lagrangian_upper_bound=CUP_NU,
     )
 
     # set up the logger
@@ -374,7 +377,7 @@ def main(args, cfg_env=None):
                 temp_kl = torch.distributions.kl_divergence(
                     distribution, old_distribution_b
                 ).sum(-1, keepdim=True)
-                coef = (1 - 0.99 * 0.95) / (1 - 0.99)
+                coef = (1 - dict_args['gamma'] * CUP_LAMBDA) / (1 - dict_args['gamma'])
                 loss_pi_cost = (
                     lagrange.lagrangian_multiplier * coef * ratio * adv_b + temp_kl
                 ).mean()
diff --git a/safepo/single_agent/focops.py b/safepo/single_agent/focops.py
index d22e177..62e2d9d 100644
--- a/safepo/single_agent/focops.py
+++ b/safepo/single_agent/focops.py
@@ -42,6 +42,8 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+FOCOPS_LAM = 1.50
+FOCOPS_NU = 2.00
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -131,6 +133,7 @@ def main(args, cfg_env=None):
         cost_limit=args.cost_limit,
         lagrangian_multiplier_init=args.lagrangian_multiplier_init,
         lagrangian_multiplier_lr=args.lagrangian_multiplier_lr,
+        lagrangian_upper_bound=FOCOPS_NU,
     )
 
     # set up the logger
@@ -328,8 +331,8 @@ def main(args, cfg_env=None):
                 temp_kl = torch.distributions.kl_divergence(
                     distribution, old_distribution_b
                 ).sum(-1, keepdim=True)
-                loss_pi = (temp_kl - (1 / 1.5) * ratio * adv_b) * (
-                    temp_kl.detach() <= 0.02
+                loss_pi = (temp_kl - (1 / FOCOPS_LAM) * ratio * adv_b) * (
+                    temp_kl.detach() <= dict_args['target_kl']
                 ).type(torch.float32)
                 loss_pi = loss_pi.mean()
                 actor_optimizer.zero_grad()
diff --git a/safepo/single_agent/natural_pg.py b/safepo/single_agent/natural_pg.py
index a29c23e..57bd8c1 100644
--- a/safepo/single_agent/natural_pg.py
+++ b/safepo/single_agent/natural_pg.py
@@ -40,6 +40,7 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+CONJUGATE_GRADIENT_ITERS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -367,7 +368,7 @@ def main(args, cfg_env=None):
         loss_pi.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         assert xHx.item() >= 0, "xHx is negative"
diff --git a/safepo/single_agent/pcpo.py b/safepo/single_agent/pcpo.py
index 9f8d696..e317f77 100644
--- a/safepo/single_agent/pcpo.py
+++ b/safepo/single_agent/pcpo.py
@@ -40,6 +40,9 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+STEP_FRACTION = 0.8
+PCPO_SEARCHING_STEPS = 200
+CONJUGATE_GRADIENT_ITERS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -362,7 +365,7 @@ def main(args, cfg_env=None):
         loss_pi_r.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         H_inv_g = fvp(x, policy, fvp_obs)
@@ -381,7 +384,7 @@ def main(args, cfg_env=None):
         b_grads = get_flat_gradients_from(policy.actor)
         ep_costs = logger.get_stats("Metrics/EpCost") - args.cost_limit
 
-        p = conjugate_gradients(fvp, policy, fvp_obs, b_grads, 15)
+        p = conjugate_gradients(fvp, policy, fvp_obs, b_grads, CONJUGATE_GRADIENT_ITERS)
         q = xHx
         r = grads.dot(p)
         s = b_grads.dot(p)
@@ -401,7 +404,7 @@ def main(args, cfg_env=None):
         expected_reward_improve = grads.dot(step_direction)
 
         kl = torch.zeros(1)
-        for step in range(200):
+        for step in range(PCPO_SEARCHING_STEPS):
             new_theta = theta_old + step_frac * step_direction
             set_param_values_to_model(policy.actor, new_theta)
             acceptance_step = step + 1
@@ -413,7 +416,7 @@ def main(args, cfg_env=None):
                 ratio = torch.exp(log_prob - data["log_prob"])
                 loss_reward = -(ratio * data["adv_r"]).mean()
             except ValueError:
-                step_frac *= 0.8
+                step_frac *= STEP_FRACTION
                 continue
             temp_distribution = policy.actor(data["obs"])
             log_prob = temp_distribution.log_prob(data["act"]).sum(dim=-1)
@@ -443,7 +446,7 @@ def main(args, cfg_env=None):
             else:
                 logger.log(f"Accept step at i={step + 1}")
                 break
-            step_frac *= 0.8
+            step_frac *= STEP_FRACTION
         else:
             logger.log("INFO: no suitable step found...")
             step_direction = torch.zeros_like(step_direction)
diff --git a/safepo/single_agent/rcpo.py b/safepo/single_agent/rcpo.py
index cd4e2d4..32d3404 100644
--- a/safepo/single_agent/rcpo.py
+++ b/safepo/single_agent/rcpo.py
@@ -41,6 +41,7 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+CONJUGATE_GRADIENT_ITERS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -376,7 +377,7 @@ def main(args, cfg_env=None):
         loss_pi.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         assert xHx.item() >= 0, "xHx is negative"
diff --git a/safepo/single_agent/trpo.py b/safepo/single_agent/trpo.py
index 465d343..84b43ae 100644
--- a/safepo/single_agent/trpo.py
+++ b/safepo/single_agent/trpo.py
@@ -40,6 +40,8 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+CONJUGATE_GRADIENT_ITERS = 15
+TRPO_SEARCHING_STEPS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -369,7 +371,7 @@ def main(args, cfg_env=None):
         loss_pi.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         assert xHx.item() >= 0, "xHx is negative"
@@ -384,7 +386,7 @@ def main(args, cfg_env=None):
         final_kl = 0.0
 
         # While not within_trust_region and not out of total_steps:
-        for step in range(15):
+        for step in range(TRPO_SEARCHING_STEPS):
             # update theta params
             new_theta = theta_old + step_frac * step_direction
             # set new params as params of net
diff --git a/safepo/single_agent/trpo_lag.py b/safepo/single_agent/trpo_lag.py
index 492fb87..53d3395 100644
--- a/safepo/single_agent/trpo_lag.py
+++ b/safepo/single_agent/trpo_lag.py
@@ -41,6 +41,8 @@ from safepo.common.model import ActorVCritic
 from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params
 
+CONJUGATE_GRADIENT_ITERS = 15
+TRPO_SEARCHING_STEPS = 15
 
 
 default_cfg = {
     'hidden_sizes': [64, 64],
@@ -378,7 +380,7 @@ def main(args, cfg_env=None):
         loss_pi.backward()
 
         grads = -get_flat_gradients_from(policy.actor)
-        x = conjugate_gradients(fvp, policy, fvp_obs, grads, 15)
+        x = conjugate_gradients(fvp, policy, fvp_obs, grads, CONJUGATE_GRADIENT_ITERS)
         assert torch.isfinite(x).all(), "x is not finite"
         xHx = torch.dot(x, fvp(x, policy, fvp_obs))
         assert xHx.item() >= 0, "xHx is negative"
@@ -393,7 +395,7 @@ def main(args, cfg_env=None):
         final_kl = 0.0
 
         # While not within_trust_region and not out of total_steps:
-        for step in range(15):
+        for step in range(TRPO_SEARCHING_STEPS):
            # update theta params
             new_theta = theta_old + step_frac * step_direction
             # set new params as params of net
diff --git a/safepo/utils/config.py b/safepo/utils/config.py
index 2036b8d..6abc6a2 100644
--- a/safepo/utils/config.py
+++ b/safepo/utils/config.py
@@ -202,7 +202,7 @@ def multi_agent_args(algo):
         {"name": "--experiment", "type": str, "default": "Base", "help": "Experiment name"},
         {"name": "--seed", "type": int, "default":0, "help": "Random seed"},
         {"name": "--model-dir", "type": str, "default": "", "help": "Choose a model dir"},
-        {"name": "--safety-bound", "type": float, "default": 25.0, "help": "cost_lim"},
+        {"name": "--cost-limit", "type": float, "default": 25.0, "help": "cost_lim"},
         {"name": "--device", "type": str, "default": "cpu", "help": "The device to run the model on"},
         {"name": "--device-id", "type": int, "default": 0, "help": "The device id to run the model on"},
         {"name": "--write-terminal", "type": lambda x: bool(strtobool(x)), "default": True, "help": "Toggles terminal logging"},
@@ -241,7 +241,7 @@ def multi_agent_args(algo):
         cfg_train.update(cfg_train.get("mamujoco"))
 
     cfg_train["use_eval"] = args.use_eval
-    cfg_train["safety_bound"]=args.safety_bound
+    cfg_train["cost_limit"]=args.cost_limit
    cfg_train["algorithm_name"]=algo
     cfg_train["device"] = args.device + ":" + str(args.device_id)
 
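
For quick reference, a minimal, self-contained sketch of how the renamed flag and config key line up after this patch. It only mirrors the "--cost-limit" argparse entry and the cfg_train["cost_limit"] assignment changed above in safepo/utils/config.py; the standalone parser and plain dict below are illustrative stand-ins, not SafePO's actual entry point.

# Hypothetical, standalone illustration -- not SafePO's real config loader.
import argparse

parser = argparse.ArgumentParser()
# Renamed CLI flag: "--safety-bound" -> "--cost-limit" (default unchanged at 25.0).
parser.add_argument("--cost-limit", type=float, default=25.0, help="cost_lim")
args = parser.parse_args([])

# Renamed config key: "safety_bound" -> "cost_limit".
cfg_train = {}
cfg_train["cost_limit"] = args.cost_limit
print(cfg_train)  # {'cost_limit': 25.0}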