feat: support multi-agent multi-goal
Gaiejj committed Aug 24, 2023
1 parent 52d640d commit c74f540
Showing 17 changed files with 379 additions and 319 deletions.
1 change: 0 additions & 1 deletion safepo/multi_agent/happo.py
@@ -467,7 +467,6 @@ def eval(self, eval_episodes=1):
one_episode_costs = torch.zeros(1, self.config["n_eval_rollout_threads"], device=self.config["device"])

eval_obs, _, _ = self.eval_envs.reset()
# eval_obs = torch.as_tensor(eval_obs, dtype=torch.float32, device=self.config["device"])

eval_rnn_states = torch.zeros(self.config["n_eval_rollout_threads"], self.num_agents, self.config["recurrent_N"], self.config["hidden_size"],
device=self.config["device"])
10 changes: 8 additions & 2 deletions safepo/multi_agent/macpo.py
@@ -26,12 +26,12 @@
import sys
import time

from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env
from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env, make_ma_multi_goal_env
from safepo.common.popart import PopArt
from safepo.common.model import MultiAgentActor as Actor, MultiAgentCritic as Critic
from safepo.common.buffer import SeparatedReplayBuffer
from safepo.common.logger import EpochLogger
from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map
from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map, multi_agent_goal_tasks


def check(input):
@@ -806,6 +806,12 @@ def train(args, cfg_train):
cfg_train["n_rollout_threads"] = env.num_envs
cfg_train["n_eval_rollout_threads"] = env.num_envs
eval_env = env
elif args.task in multi_agent_goal_tasks:
env = make_ma_multi_goal_env(task=args.task, seed=args.seed, cfg_train=cfg_train)
cfg_eval = copy.deepcopy(cfg_train)
cfg_eval["seed"] = args.seed + 10000
cfg_eval["n_rollout_threads"] = cfg_eval["n_eval_rollout_threads"]
eval_env = make_ma_multi_goal_env(task=args.task, seed=args.seed + 10000, cfg_train=cfg_eval)
else:
raise NotImplementedError

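The new branch above routes goal-conditioned tasks to make_ma_multi_goal_env and builds a separate evaluation environment from a deep copy of the training config with an offset seed. A minimal sketch of that dispatch, pulled out of train() into a standalone helper (the build_envs wrapper itself is illustrative, not part of SafePO):

import copy

from safepo.common.env import make_ma_multi_goal_env
from safepo.utils.config import multi_agent_goal_tasks


def build_envs(args, cfg_train):
    # Return (train_env, eval_env) for a multi-agent multi-goal task.
    if args.task not in multi_agent_goal_tasks:
        raise NotImplementedError(f"unsupported task: {args.task}")
    env = make_ma_multi_goal_env(task=args.task, seed=args.seed, cfg_train=cfg_train)
    # Evaluation gets its own config copy so its seed and rollout-thread count
    # can differ from training without mutating cfg_train.
    cfg_eval = copy.deepcopy(cfg_train)
    cfg_eval["seed"] = args.seed + 10000
    cfg_eval["n_rollout_threads"] = cfg_eval["n_eval_rollout_threads"]
    eval_env = make_ma_multi_goal_env(task=args.task, seed=cfg_eval["seed"], cfg_train=cfg_eval)
    return env, eval_env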
17 changes: 10 additions & 7 deletions safepo/multi_agent/mappo.py
@@ -26,12 +26,12 @@
import sys
import time

from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env
from safepo.common.env import make_ma_mujoco_env, make_ma_isaac_env, make_ma_multi_goal_env
from safepo.common.popart import PopArt
from safepo.common.model import MultiAgentActor as Actor, MultiAgentCritic as Critic
from safepo.common.buffer import SeparatedReplayBuffer
from safepo.common.logger import EpochLogger
from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map
from safepo.utils.config import multi_agent_args, parse_sim_params, set_np_formatting, set_seed, multi_agent_velocity_map, isaac_gym_map, multi_agent_goal_tasks


def check(input):
@@ -152,7 +152,7 @@ def ppo_update(self, sample):
(policy_loss - dist_entropy * self.config["entropy_coef"]).backward()
actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.config["max_grad_norm"])
self.policy.actor_optimizer.step()

value_loss = self.cal_value_loss(values, value_preds_batch, return_batch, active_masks_batch)
self.policy.critic_optimizer.zero_grad()
(value_loss * self.config["value_loss_coef"]).backward()
@@ -162,17 +162,13 @@ def ppo_update(self, sample):

return value_loss, critic_grad_norm, policy_loss, dist_entropy, actor_grad_norm, imp_weights


def train(self, buffer, logger):
advantages = buffer.returns[:-1] - self.value_normalizer.denormalize(buffer.value_preds[:-1])
advantages_copy = advantages.clone()
# advantages_copy[buffer.active_masks[:-1] == 0.0] = torch.nan
mean_advantages = torch.mean(advantages_copy)
# std_advantages = torch.std(advantages_copy)
std_advantages = torch.std(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)


for _ in range(self.config["ppo_epoch"]):
data_generator = buffer.feed_forward_generator(advantages, self.config["num_mini_batch"])

@@ -488,6 +484,7 @@ def eval(self, eval_episodes=1):
zeros = torch.zeros(eval_actions_collector[-1].shape[0], 1)
eval_actions_collector[-1]=torch.cat((eval_actions_collector[-1], zeros), dim=1)


eval_obs, _, eval_rewards, eval_costs, eval_dones, _, _ = self.eval_envs.step(
eval_actions_collector
)
@@ -553,6 +550,12 @@ def train(args, cfg_train):
cfg_train["n_rollout_threads"] = env.num_envs
cfg_train["n_eval_rollout_threads"] = env.num_envs
eval_env = env
elif args.task in multi_agent_goal_tasks:
env = make_ma_multi_goal_env(task=args.task, seed=args.seed, cfg_train=cfg_train)
cfg_eval = copy.deepcopy(cfg_train)
cfg_eval["seed"] = args.seed + 10000
cfg_eval["n_rollout_threads"] = cfg_eval["n_eval_rollout_threads"]
eval_env = make_ma_multi_goal_env(task=args.task, seed=args.seed + 10000, cfg_train=cfg_eval)
else:
raise NotImplementedError

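In the train() hunk above, advantages are now normalized directly from batch statistics (returns minus the denormalized value predictions), and the commented-out active-mask / NaN handling is dropped. A small self-contained sketch of that normalization step (the normalize_advantages helper and the tensor shapes are illustrative):

import torch


def normalize_advantages(returns, values, eps=1e-5):
    # Standardize advantages over the whole batch, mirroring mappo's train().
    advantages = returns - values
    mean_adv = torch.mean(advantages)
    std_adv = torch.std(advantages)
    # eps keeps the division stable when all advantages are nearly identical.
    return (advantages - mean_adv) / (std_adv + eps)


# Example: 100 steps x 8 rollout threads of fake returns and value estimates.
adv = normalize_advantages(torch.randn(100, 8), torch.randn(100, 8))
print(adv.mean().item(), adv.std().item())  # roughly 0 and 1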
2 changes: 1 addition & 1 deletion safepo/single_agent/benchmark.py
@@ -51,7 +51,7 @@ def parse_args():
"--start-seed", type=int, default=0, help="the number of the starting seed"
)
parser.add_argument(
"--workers", type=int, default=16, help="the number of workers to run benchmark experimenets",
"--workers", type=int, default=8, help="the number of workers to run benchmark experimenets",
)
parser.add_argument(
"--experiment", type=str, default="benchmark_new_8_24", help="name of the experiment"
97 changes: 51 additions & 46 deletions safepo/single_agent/cpo.py
@@ -40,6 +40,7 @@
from safepo.common.model import ActorVCritic
from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params


default_cfg = {
'hidden_sizes': [64, 64],
'gamma': 0.99,
@@ -59,6 +60,7 @@
'use_value_coefficient': True,
'learning_iters': 8,
'max_grad_norm': 1.0,
'use_critic_norm': False,
}


@@ -182,8 +184,9 @@ def main(args, cfg_env=None):

# set training steps
steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch)
total_steps = config.get("total_steps", args.total_steps)
local_steps_per_epoch = steps_per_epoch // args.num_envs
epochs = args.total_steps // steps_per_epoch
epochs = total_steps // steps_per_epoch
# create the actor-critic module
policy = ActorVCritic(
obs_dim=obs_space.shape[0],
@@ -544,14 +547,16 @@ def f_b(lam: torch.Tensor) -> torch.Tensor:
) in dataloader:
reward_critic_optimizer.zero_grad()
loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b)

cost_critic_optimizer.zero_grad()
loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b)
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
total_loss = loss_r + loss_c
if config.get("use_critic_norm", True):
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
total_loss = 2*loss_r + loss_c \
if config.get("use_value_coefficient", False) \
else loss_r + loss_c
total_loss.backward()
clip_grad_norm_(policy.parameters(), config["max_grad_norm"])
reward_critic_optimizer.step()
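The critic update above now gates the L2 weight penalty behind use_critic_norm (default_cfg sets it to False, so the penalty is off unless a task config re-enables it) and, when use_value_coefficient is enabled, doubles the weight of the reward-critic loss. A standalone sketch of that loss assembly, with small placeholder critics standing in for ActorVCritic's:

import torch
import torch.nn as nn


def critic_loss(reward_critic, cost_critic, obs, target_r, target_c,
                use_critic_norm=False, use_value_coefficient=True, norm_coef=0.001):
    loss_r = nn.functional.mse_loss(reward_critic(obs), target_r)
    loss_c = nn.functional.mse_loss(cost_critic(obs), target_c)
    if use_critic_norm:
        # Optional L2 penalty on critic weights, matching the gated block above.
        for p in reward_critic.parameters():
            loss_r = loss_r + p.pow(2).sum() * norm_coef
        for p in cost_critic.parameters():
            loss_c = loss_c + p.pow(2).sum() * norm_coef
    # Up-weight the reward-critic loss when use_value_coefficient is set.
    return 2 * loss_r + loss_c if use_value_coefficient else loss_r + loss_c


# Placeholder critics; in SafePO these come from the ActorVCritic module.
r_critic = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))
c_critic = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))
obs, t_r, t_c = torch.randn(32, 4), torch.randn(32, 1), torch.randn(32, 1)
critic_loss(r_critic, c_critic, obs, t_r, t_c).backward()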
@@ -564,45 +569,45 @@ def f_b(lam: torch.Tensor) -> torch.Tensor:
}
)
update_end_time = time.time()

# log data
logger.log_tabular("Metrics/EpRet")
logger.log_tabular("Metrics/EpCost")
logger.log_tabular("Metrics/EpLen")
if args.use_eval:
logger.log_tabular("Metrics/EvalEpRet")
logger.log_tabular("Metrics/EvalEpCost")
logger.log_tabular("Metrics/EvalEpLen")
logger.log_tabular("Train/Epoch", epoch + 1)
logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch)
logger.log_tabular("Train/KL")
logger.log_tabular("Loss/Loss_reward_critic")
logger.log_tabular("Loss/Loss_cost_critic")
logger.log_tabular("Loss/Loss_actor")
logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time)
if args.use_eval:
logger.log_tabular("Time/Eval", eval_end_time - eval_start_time)
logger.log_tabular("Time/Update", update_end_time - eval_end_time)
logger.log_tabular("Time/Total", update_end_time - rollout_start_time)
logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item())
logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item())
logger.log_tabular("Misc/Alpha")
logger.log_tabular("Misc/FinalStepNorm")
logger.log_tabular("Misc/xHx")
logger.log_tabular("Misc/gradient_norm")
logger.log_tabular("Misc/H_inv_g")
logger.log_tabular("Misc/AcceptanceStep")

logger.dump_tabular()
if (epoch+1) % 100 == 0 or epoch == 0:
logger.torch_save(itr=epoch)
if args.task not in isaac_gym_map.keys():
logger.save_state(
state_dict={
"Normalizer": env.obs_rms,
},
itr = epoch
)
if not logger.logged:
# log data
logger.log_tabular("Metrics/EpRet")
logger.log_tabular("Metrics/EpCost")
logger.log_tabular("Metrics/EpLen")
if args.use_eval:
logger.log_tabular("Metrics/EvalEpRet")
logger.log_tabular("Metrics/EvalEpCost")
logger.log_tabular("Metrics/EvalEpLen")
logger.log_tabular("Train/Epoch", epoch + 1)
logger.log_tabular("Train/TotalSteps", (epoch + 1) * args.steps_per_epoch)
logger.log_tabular("Train/KL")
logger.log_tabular("Loss/Loss_reward_critic")
logger.log_tabular("Loss/Loss_cost_critic")
logger.log_tabular("Loss/Loss_actor")
logger.log_tabular("Time/Rollout", rollout_end_time - rollout_start_time)
if args.use_eval:
logger.log_tabular("Time/Eval", eval_end_time - eval_start_time)
logger.log_tabular("Time/Update", update_end_time - eval_end_time)
logger.log_tabular("Time/Total", update_end_time - rollout_start_time)
logger.log_tabular("Value/RewardAdv", data["adv_r"].mean().item())
logger.log_tabular("Value/CostAdv", data["adv_c"].mean().item())
logger.log_tabular("Misc/Alpha")
logger.log_tabular("Misc/FinalStepNorm")
logger.log_tabular("Misc/xHx")
logger.log_tabular("Misc/gradient_norm")
logger.log_tabular("Misc/H_inv_g")
logger.log_tabular("Misc/AcceptanceStep")

logger.dump_tabular()
if (epoch+1) % 100 == 0 or epoch == 0:
logger.torch_save(itr=epoch)
if args.task not in isaac_gym_map.keys():
logger.save_state(
state_dict={
"Normalizer": env.obs_rms,
},
itr = epoch
)
logger.close()


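Across the single-agent scripts, total_steps can now be overridden by the task config instead of always coming from the command line, and the epoch count is derived from whichever value wins. A minimal sketch of that resolution order (the argparse defaults here are illustrative, not SafePO's actual defaults):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--steps-per-epoch", type=int, default=20000)
parser.add_argument("--total-steps", type=int, default=10_000_000)
parser.add_argument("--num-envs", type=int, default=10)
args = parser.parse_args([])  # take the CLI defaults for this example

# Per-task config; keys present here override the CLI values.
config = {"total_steps": 2_000_000}

steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch)
total_steps = config.get("total_steps", args.total_steps)
local_steps_per_epoch = steps_per_epoch // args.num_envs
epochs = total_steps // steps_per_epoch
print(local_steps_per_epoch, epochs)  # 2000 and 100 with these numbers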
13 changes: 8 additions & 5 deletions safepo/single_agent/cppo_pid.py
@@ -61,6 +61,7 @@
'use_value_coefficient': True,
'learning_iters': 8,
'max_grad_norm': 1.0,
'use_critic_norm': False,
}

def main(args, cfg_env=None):
@@ -91,8 +92,9 @@ def main(args, cfg_env=None):

# set training steps
steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch)
total_steps = config.get("total_steps", args.total_steps)
local_steps_per_epoch = steps_per_epoch // args.num_envs
epochs = args.total_steps // steps_per_epoch
epochs = total_steps // steps_per_epoch
# create the actor-critic module
policy = ActorVCritic(
obs_dim=obs_space.shape[0],
@@ -304,10 +306,11 @@ def main(args, cfg_env=None):
loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b)
cost_critic_optimizer.zero_grad()
loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b)
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
if config.get("use_critic_norm", True):
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
distribution = policy.actor(obs_b)
log_prob = distribution.log_prob(act_b).sum(dim=-1)
ratio = torch.exp(log_prob - log_prob_b)
13 changes: 8 additions & 5 deletions safepo/single_agent/cup.py
@@ -62,6 +62,7 @@
'use_value_coefficient': True,
'learning_iters': 8,
'max_grad_norm': 1.0,
'use_critic_norm': False,
}

def main(args, cfg_env=None):
@@ -92,8 +93,9 @@ def main(args, cfg_env=None):

# set training steps
steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch)
total_steps = config.get("total_steps", args.total_steps)
local_steps_per_epoch = steps_per_epoch // args.num_envs
epochs = args.total_steps // steps_per_epoch
epochs = total_steps // steps_per_epoch
# create the actor-critic module
policy = ActorVCritic(
obs_dim=obs_space.shape[0],
@@ -307,10 +309,11 @@ def main(args, cfg_env=None):
loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b)
cost_critic_optimizer.zero_grad()
loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b)
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
if config.get("use_critic_norm", True):
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
distribution = policy.actor(obs_b)
log_prob = distribution.log_prob(act_b).sum(dim=-1)
ratio = torch.exp(log_prob - log_prob_b)
17 changes: 11 additions & 6 deletions safepo/single_agent/focops.py
@@ -42,6 +42,7 @@
from safepo.common.model import ActorVCritic
from safepo.utils.config import single_agent_args, isaac_gym_map, parse_sim_params


default_cfg = {
'hidden_sizes': [64, 64],
'gamma': 0.99,
@@ -61,6 +62,7 @@
'use_value_coefficient': True,
'learning_iters': 8,
'max_grad_norm': 1.0,
'use_critic_norm': False,
}

def main(args, cfg_env=None):
@@ -91,8 +93,9 @@ def main(args, cfg_env=None):

# set training steps
steps_per_epoch = config.get("steps_per_epoch", args.steps_per_epoch)
total_steps = config.get("total_steps", args.total_steps)
local_steps_per_epoch = steps_per_epoch // args.num_envs
epochs = args.total_steps // steps_per_epoch
epochs = total_steps // steps_per_epoch
# create the actor-critic module
policy = ActorVCritic(
obs_dim=obs_space.shape[0],
@@ -312,10 +315,11 @@ def main(args, cfg_env=None):
loss_r = nn.functional.mse_loss(policy.reward_critic(obs_b), target_value_r_b)
cost_critic_optimizer.zero_grad()
loss_c = nn.functional.mse_loss(policy.cost_critic(obs_b), target_value_c_b)
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001
if config.get("use_critic_norm", True):
for param in policy.reward_critic.parameters():
loss_r += param.pow(2).sum() * 0.001
for param in policy.cost_critic.parameters():
loss_c += param.pow(2).sum() * 0.001

old_distribution_b = Normal(loc=old_mean_b, scale=old_std_b)
distribution = policy.actor(obs_b)
@@ -326,7 +330,8 @@ def main(args, cfg_env=None):
).sum(-1, keepdim=True)
loss_pi = (temp_kl - (1 / 1.5) * ratio * adv_b) * (
temp_kl.detach() <= 0.02
).type(torch.float32).mean()
).type(torch.float32)
loss_pi = loss_pi.mean()
actor_optimizer.zero_grad()
total_loss = loss_pi + 2*loss_r + loss_c \
if config.get("use_value_coefficient", False) \
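The loss_pi change above corrects the reduction order: in the old form, .mean() bound to the KL indicator mask, so the masked per-sample loss was never reduced to a scalar; the new form masks each sample first and then takes the mean. A small sketch of the difference, with made-up tensors standing in for temp_kl, ratio, and adv_b:

import torch

torch.manual_seed(0)
temp_kl = torch.rand(6, 1) * 0.04      # per-sample KL to the old policy
ratio = torch.rand(6, 1) + 0.5         # importance-sampling ratios
adv_b = torch.randn(6, 1)              # advantages
nu, delta = 1 / 1.5, 0.02              # FOCOPS coefficient and KL threshold

per_sample = temp_kl - nu * ratio * adv_b
mask = (temp_kl.detach() <= delta).float()

old = per_sample * mask.mean()         # old precedence: result still has shape (6, 1)
new = (per_sample * mask).mean()       # fixed: a scalar, ready for backward()
print(old.shape, new.shape)            # torch.Size([6, 1]) torch.Size([])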