diff --git a/.gitignore b/.gitignore
index f8f04061..94f827de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,8 @@ experiment.log
 # Pax
 pax/version.py
+
+*.gif
+*.json
+*.png
+*.sh
diff --git a/docs/getting-started/runners.md b/docs/getting-started/runners.md
index 43dc241d..45d55770 100644
--- a/docs/getting-started/runners.md
+++ b/docs/getting-started/runners.md
@@ -23,6 +23,61 @@ In order for this approach to work the observation vector needs to include one e
 See [this experiment](https://github.com/akbir/pax/blob/9d3fa62e34279a338c07cffcbf208edc8a95e7ba/pax/conf/experiment/rice/weight_sharing.yaml) for an example of how to configure it.
+## Evo Hardstop
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner stops the opponent's learning during training, corresponding to the hardstop challenge of Shaper.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Scanned
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+Here we also scan over the evolutionary steps, which makes compilation longer and training shorter, but logging stats is not possible.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed LR Runner (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner randomly samples learning rates for the opponents.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has a different payoff matrix.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff Gen (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed IPD Payoff (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner randomly samples payoffs that follow the Iterated Prisoner's Dilemma [constraints](https://en.wikipedia.org/wiki/Prisoner%27s_dilemma).
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff Input (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix. The payoff matrix is included in the agent's observation.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
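+
+All of these Evo runner variants are launched like any other Pax experiment, via Hydra config selection and `++` overrides. A minimal sketch (the config name comes from the experiment linked above; the `wandb.log` and `seed` overrides follow the evaluation scripts in this repository):
+
+```bash
+# Minimal sketch: select the linked experiment config and override a couple of flags.
+python -m pax.experiment +experiment/ipd=shaper_att_v_tabular ++wandb.log=False ++seed=0
+```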
+
+## Evo Mixed Payoff Only Opp (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+Noise is added to the opponents' IPD-like payoff matrix at each rollout. Each opponent has the same noise added.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
diff --git a/hardstop_eval_bash.sh b/hardstop_eval_bash.sh
new file mode 100755
index 00000000..0d7e1db8
--- /dev/null
+++ b/hardstop_eval_bash.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+###### MFOS AVG ######
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4ykf9oe8 ++model_path=exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/eopf93re ++model_path=exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1sqbd09n ++model_path=exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3n7l8ods ++model_path=exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4mf1ecxq ++model_path=exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4ykf9oe8 ++model_path=exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/eopf93re ++model_path=exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1sqbd09n ++model_path=exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3n7l8ods ++model_path=exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True
++run_path=ucl-dark/ipd/4mf1ecxq ++model_path=exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 + +###### Shaper Nothing #$$$ +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/2m3wh5g7 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1jk5zly5 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1cvpiolk ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3vml0wjy ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 + +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/2m3wh5g7 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1jk5zly5 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1cvpiolk ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3vml0wjy ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 \ No newline at end of file diff --git a/pax/agents/mfos_ppo/networks.py b/pax/agents/mfos_ppo/networks.py index cb5397a1..5b3ea405 100644 --- a/pax/agents/mfos_ppo/networks.py +++ b/pax/agents/mfos_ppo/networks.py @@ -151,6 +151,28 @@ def forward_fn( network = hk.without_apply_rng(hk.transform(forward_fn)) return network, hidden_state +def make_mfos_avg_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, 3 * 
hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, + state: Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray], + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + mfos = ActorCriticMFOS(num_actions, hidden_size) + hidden_t, hidden_a, hidden_v = jnp.split(state, 3, axis=-1) + avg_hidden_t = jnp.mean(hidden_t, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + avg_hidden_a = jnp.mean(hidden_a, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + avg_hidden_v = jnp.mean(hidden_v, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + hidden_t = 0.5*hidden_t + 0.5*avg_hidden_t + hidden_a = 0.5*hidden_a + 0.5*avg_hidden_a + hidden_v = 0.5*hidden_v + 0.5*avg_hidden_v + state = jnp.concatenate([hidden_t, hidden_a, hidden_v], axis=-1) + logits, values, state = mfos(inputs, state) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + def make_mfos_continuous_network(num_actions: int, hidden_size: int): hidden_state = jnp.zeros((1, 3 * hidden_size)) diff --git a/pax/agents/mfos_ppo/ppo_gru.py b/pax/agents/mfos_ppo/ppo_gru.py index 2e130fa7..15e08860 100644 --- a/pax/agents/mfos_ppo/ppo_gru.py +++ b/pax/agents/mfos_ppo/ppo_gru.py @@ -12,6 +12,7 @@ from pax.agents.mfos_ppo.networks import ( make_mfos_ipditm_network, make_mfos_network, + make_mfos_avg_network, make_mfos_continuous_network, ) from pax.envs.rice.rice import Rice @@ -65,7 +66,6 @@ def __init__( obs_spec: Tuple, batch_size: int = 2000, num_envs: int = 4, - num_steps: int = 500, num_minibatches: int = 16, num_epochs: int = 4, clip_value: bool = True, @@ -481,8 +481,8 @@ def prepare_batch( # Other useful hyperparameters self._num_envs = num_envs # number of environments - self._num_steps = num_steps # number of steps per environment - self._batch_size = int(num_envs * num_steps) # number in one batch + # self._num_steps = num_steps # number of steps per environment + # self._batch_size = int(num_envs * num_steps) # number in one batch self._num_minibatches = num_minibatches # number of minibatches self._num_epochs = num_epochs # number of epochs to use sample self._gru_dim = gru_dim @@ -578,6 +578,17 @@ def make_mfos_agent( agent_args.output_channels, agent_args.kernel_shape, ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + raise ValueError("Attention not supported") + elif args.att_type=='avg': + network, initial_hidden_state = make_mfos_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_mfos_network( + action_spec, agent_args.hidden_size + ) else: raise ValueError("Unsupported environment") @@ -620,7 +631,6 @@ def make_mfos_agent( obs_spec=obs_spec, batch_size=None, num_envs=args.num_envs, - num_steps=args.num_steps, num_minibatches=agent_args.num_minibatches, num_epochs=agent_args.num_epochs, clip_value=agent_args.clip_value, diff --git a/pax/agents/ppo/ppo.py b/pax/agents/ppo/ppo.py index 9a098846..459d6c01 100644 --- a/pax/agents/ppo/ppo.py +++ b/pax/agents/ppo/ppo.py @@ -506,6 +506,16 @@ def make_agent( agent_args.output_channels, agent_args.kernel_shape, ) + elif args.env_id in [ + "iterated_matrix_game", + "iterated_tensor_game", + "iterated_nplayer_tensor_game", + "third_party_punishment", + "third_party_random", + ]: + network = make_ipd_network( + action_spec, tabular, agent_args.hidden_size + ) elif args.env_id == "Cournot": network = make_cournot_network(action_spec, agent_args.hidden_size) elif args.env_id == "Fishery": @@ -534,6 
+544,7 @@ def make_agent( ) if agent_args.lr_scheduling: + scale = optax.inject_hyperparams(optax.scale)(step_size=-1.0) scheduler = optax.linear_schedule( init_value=agent_args.learning_rate, end_value=0, @@ -543,15 +554,18 @@ def make_agent( optax.clip_by_global_norm(agent_args.max_gradient_norm), optax.scale_by_adam(eps=agent_args.adam_epsilon), optax.scale_by_schedule(scheduler), - optax.scale(-1), + scale, ) + # optimizer = optax.inject_hyperparams(optimizer)(learning_rate=agent_args.learning_rate) else: + scale = optax.inject_hyperparams(optax.scale)(step_size=-agent_args.learning_rate) optimizer = optax.chain( optax.clip_by_global_norm(agent_args.max_gradient_norm), optax.scale_by_adam(eps=agent_args.adam_epsilon), - optax.scale(-agent_args.learning_rate), + scale, ) + # optimizer = optax.inject_hyperparams(optimizer)(learning_rate=agent_args.learning_rate) # Random key random_key = jax.random.PRNGKey(seed=seed) diff --git a/pax/agents/shaper_att/networks.py b/pax/agents/shaper_att/networks.py new file mode 100644 index 00000000..a8396d8a --- /dev/null +++ b/pax/agents/shaper_att/networks.py @@ -0,0 +1,697 @@ +from typing import Optional, Tuple + +import distrax +import haiku as hk +import jax +import jax.numpy as jnp + +from pax import utils + + +class CategoricalValueHead(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHead_ipd(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) 
+ + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate_ipditm(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + hidden_size: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class ContinuousValueHead(hk.Module): + """Network head that produces a continuous distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), # baseline + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), # baseline + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.MultivariateNormalDiag(loc=logits), value) + + +class Tabular(hk.Module): + def __init__(self, num_values: int): + super().__init__(name="Tabular") + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def _input_to_onehot(input: jnp.ndarray): + chunks = jnp.array([9**3, 9**2, 9, 1], dtype=jnp.int32) + idx = input.nonzero(size=4)[0] + idx = jnp.mod(idx, 9) + idx = chunks * idx + idx = jnp.sum(idx) + return jax.nn.one_hot(idx, num_classes=6561) + + self.input_to_onehot = jax.vmap(_input_to_onehot) + + def __call__(self, inputs: jnp.ndarray): + inputs = self.input_to_onehot(inputs) + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + + return (distrax.Categorical(logits=logits), value) + + +class CNN(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.conv_a_1 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.linear_a_0 = hk.Linear(output_channels) + + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + # Actor and Critic 
+ x = self.conv_a_0(inputs) + x = jax.nn.relu(x) + x = self.conv_a_1(x) + x = jax.nn.relu(x) + x = self.flatten(x) + x = self.linear_a_0(x) + x = jax.nn.relu(x) + return x + + +class CNN_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + ) + # akbir suggested fix + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor and Critic + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + return x + + +class CNNSeparate_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape, num_actions: int): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_a_0 = hk.Linear(output_channels) + self.conv_v_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_v_0 = hk.Linear(1) + self.flatten = hk.Flatten() + + def __call__(self, inputs): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + logits = self.linear_a_0(x) + + # Critic + x = self.conv_v_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + x = self.linear_v_0(x) + val = x + return (distrax.Categorical(logits=logits), jnp.squeeze(val, axis=-1)) + + +def make_GRU_ipd_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + embedding, state = gru(inputs, state) + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_avg_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + old_state = state + # jax.debug.breakpoint() + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_att_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + # print(state.shape, 'STATE shape') + gru = hk.GRU(hidden_size) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 1 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + 
w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + b_init=hk.initializers.Constant(0), + # with_bias=False, + ) + old_state = state + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state_attn + state) + state = shape_mlp(state) + state = 0.5*old_state + 0.5*state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_cartpole_network(num_actions: int): + hidden_size = 256 + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = hk.nets.MLP( + [hidden_size, hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU(hidden_size) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_coingame_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_coingame_att_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + 
) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def make_GRU_ipditm_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_att_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
+ # num_opps is our true batch size + # num_envs is actually part of our featuer space + # lets use attention network to over the hidden_states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper network to obfuscated + print("state", state.shape) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_avg_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
+ # num_opps is our true batch size + # num_envs is actually part of our featuer space + # lets use attention network to over the hidden_states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper network to obfuscated + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def test_GRU(): + key = jax.random.PRNGKey(seed=0) + num_actions = 2 + obs_spec = (5,) + key, subkey = jax.random.split(key) + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + network, hidden = make_GRU_ipd_network(num_actions) + print(hidden.shape) + initial_params = network.init(subkey, dummy_obs, hidden) + print("GRU w_i", initial_params["gru"]["w_i"].shape) + print("GRU w_h", initial_params["gru"]["w_h"].shape) + print( + "Policy head", + initial_params["categorical_value_head/~/linear"]["w"].shape, + ) + print( + "Value head", + initial_params["categorical_value_head/~/linear_1"]["w"].shape, + ) + observation = jnp.zeros(shape=(1, 5)) + observation = jnp.zeros(shape=(10, 5)) + (logits, values), hidden = network.apply( + initial_params, observation, hidden + ) + print(hidden.shape) + return network + + +if __name__ == "__main__": + test_GRU() diff --git a/pax/agents/shaper_att/ppo_gru.py b/pax/agents/shaper_att/ppo_gru.py new file mode 100644 index 00000000..87b0056a --- /dev/null +++ b/pax/agents/shaper_att/ppo_gru.py @@ -0,0 +1,610 @@ +# Adapted from https://github.com/deepmind/acme/blob/master/acme/agents/jax/ppo/learning.py + +from typing import Any, Dict, NamedTuple, Tuple + +import haiku as hk +import jax +import jax.numpy as jnp +import optax + +from pax import utils +from pax.agents.agent import AgentInterface +from pax.agents.shaper_att.networks import ( + make_GRU_cartpole_network, + make_GRU_coingame_att_network, + make_GRU_ipd_network, + make_GRU_ipd_avg_network, + make_GRU_ipd_att_network, + make_GRU_ipditm_att_network, + make_GRU_ipditm_avg_network, +) +from pax.utils import MemoryState, TrainingState, get_advantages + +# from dm_env import TimeStep + + +class Batch(NamedTuple): + """A batch of data; all shapes are expected to be [B, ...].""" + + observations: jnp.ndarray + actions: jnp.ndarray + advantages: jnp.ndarray + + # Target value estimate used to bootstrap the value function. + target_values: jnp.ndarray + + # Value estimate and action log-prob at behavior time. 
+ behavior_values: jnp.ndarray + behavior_log_probs: jnp.ndarray + + # GRU specific + hiddens: jnp.ndarray + + +class Logger: + metrics: dict + + +class PPO(AgentInterface): + """A simple PPO agent with memory using JAX""" + + def __init__( + self, + network: NamedTuple, + initial_hidden_state: jnp.ndarray, + optimizer: optax.GradientTransformation, + random_key: jnp.ndarray, + gru_dim: int, + obs_spec: Tuple, + num_envs: int = 4, + num_minibatches: int = 16, + num_epochs: int = 4, + clip_value: bool = True, + value_coeff: float = 0.5, + anneal_entropy: bool = False, + entropy_coeff_start: float = 0.1, + entropy_coeff_end: float = 0.01, + entropy_coeff_horizon: int = 3_000_000, + ppo_clipping_epsilon: float = 0.2, + gamma: float = 0.99, + gae_lambda: float = 0.95, + player_id: int = 0, + ): + @jax.jit + def policy( + state: TrainingState, observation: jnp.ndarray, mem: MemoryState + ): + """Agent policy to select actions and calculate agent specific information""" + key, subkey = jax.random.split(state.random_key) + (dist, values), hidden_state = network.apply( + state.params, observation, mem.hidden + ) + + actions = dist.sample(seed=subkey) + mem.extras["values"] = values + mem.extras["log_probs"] = dist.log_prob(actions) + mem = mem._replace(hidden=hidden_state, extras=mem.extras) + state = state._replace(random_key=key) + return ( + actions, + state, + mem, + ) + + @jax.jit + def gae_advantages( + rewards: jnp.ndarray, values: jnp.ndarray, dones: jnp.ndarray + ) -> jnp.ndarray: + """Calculates the gae advantages from a sequence. Note that the + arguments are of length = rollout length + 1""" + # 'Zero out' the terminated states + discounts = gamma * jnp.logical_not(dones) + reverse_batch = ( + jnp.flip(values[:-1], axis=0), + jnp.flip(rewards, axis=0), + jnp.flip(discounts, axis=0), + ) + + _, advantages = jax.lax.scan( + get_advantages, + ( + jnp.zeros_like(values[-1]), + values[-1], + jnp.ones_like(values[-1]) * gae_lambda, + ), + reverse_batch, + ) + + advantages = jnp.flip(advantages, axis=0) + target_values = values[:-1] + advantages # Q-value estimates + target_values = jax.lax.stop_gradient(target_values) + return advantages, target_values + + def loss( + params: hk.Params, + timesteps: int, + observations: jnp.ndarray, + actions: jnp.array, + behavior_log_probs: jnp.array, + target_values: jnp.array, + advantages: jnp.array, + behavior_values: jnp.array, + hiddens: jnp.ndarray, + ): + """Surrogate loss using clipped probability ratios.""" + (distribution, values), _ = network.apply( + params, observations, hiddens + ) + + log_prob = distribution.log_prob(actions) + entropy = distribution.entropy() + + # Compute importance sampling weights: current policy / behavior policy. + rhos = jnp.exp(log_prob - behavior_log_probs) + + # Policy loss: Clipping + clipped_ratios_t = jnp.clip( + rhos, 1.0 - ppo_clipping_epsilon, 1.0 + ppo_clipping_epsilon + ) + clipped_objective = jnp.fmin( + rhos * advantages, clipped_ratios_t * advantages + ) + policy_loss = -jnp.mean(clipped_objective) + + # Value loss: MSE + value_cost = value_coeff + unclipped_value_error = target_values - values + unclipped_value_loss = unclipped_value_error**2 + + # Value clipping + if clip_value: + # Clip values to reduce variablility during critic training. 
+ clipped_values = behavior_values + jnp.clip( + values - behavior_values, + -ppo_clipping_epsilon, + ppo_clipping_epsilon, + ) + clipped_value_error = target_values - clipped_values + clipped_value_loss = clipped_value_error**2 + value_loss = jnp.mean( + jnp.fmax(unclipped_value_loss, clipped_value_loss) + ) + else: + value_loss = jnp.mean(unclipped_value_loss) + + # Entropy loss: Standard entropy term + # Calculate the new value based on linear annealing formula + if anneal_entropy: + fraction = jnp.fmax(1 - timesteps / entropy_coeff_horizon, 0) + entropy_cost = ( + fraction * entropy_coeff_start + + (1 - fraction) * entropy_coeff_end + ) + # Constant Entropy term + else: + entropy_cost = entropy_coeff_start + entropy_loss = -jnp.mean(entropy) + + # Total loss: Minimize policy and value loss; maximize entropy + total_loss = ( + policy_loss + + entropy_cost * entropy_loss + + value_loss * value_cost + ) + + return total_loss, { + "loss_total": total_loss, + "loss_policy": policy_loss, + "loss_value": value_loss, + "loss_entropy": entropy_loss, + "entropy_cost": entropy_cost, + } + + @jax.jit + def sgd_step( + state: TrainingState, sample: NamedTuple + ) -> Tuple[TrainingState, Dict[str, jnp.ndarray]]: + """Performs a minibatch SGD step, returning new state and metrics.""" + # Extract data + ( + observations, + actions, + rewards, + behavior_log_probs, + behavior_values, + dones, + hiddens, + ) = ( + sample.observations, + sample.actions, + sample.rewards, + sample.behavior_log_probs, + sample.behavior_values, + sample.dones, + sample.hiddens, + ) + + # batch_gae_advantages = jax.vmap(gae_advantages, 1, (0, 0)) + advantages, target_values = gae_advantages( + rewards=rewards, values=behavior_values, dones=dones + ) + + # Exclude the last step - it was only used for bootstrapping. + # The shape is [num_steps, num_envs, ..] + behavior_values = behavior_values[:-1, :] + trajectories = Batch( + observations=observations, + actions=actions, + advantages=advantages, + behavior_log_probs=behavior_log_probs, + target_values=target_values, + behavior_values=behavior_values, + hiddens=hiddens, + ) + # Concatenate all trajectories. Reshape from [num_envs, num_steps, ..] + # to [num_envs * num_steps,..] + assert len(target_values.shape) > 1 + num_envs = target_values.shape[1] + num_steps = target_values.shape[0] + batch_size = num_envs * num_steps + assert batch_size % num_minibatches == 0, ( + "Num minibatches must divide batch size. Got batch_size={}" + " num_minibatches={}." + ).format(batch_size, num_minibatches) + + batch = jax.tree_util.tree_map( + lambda x: x.reshape((batch_size,) + x.shape[2:]), trajectories + ) + # Compute gradients. + grad_fn = jax.jit(jax.grad(loss, has_aux=True)) + + def model_update_minibatch( + carry: Tuple[hk.Params, optax.OptState, int], + minibatch: Batch, + ) -> Tuple[ + Tuple[hk.Params, optax.OptState, int], Dict[str, jnp.ndarray] + ]: + """Performs model update for a single minibatch.""" + params, opt_state, timesteps = carry + # Normalize advantages at the minibatch level before using them. 
+ advantages = ( + minibatch.advantages + - jnp.mean(minibatch.advantages, axis=0) + ) / (jnp.std(minibatch.advantages, axis=0) + 1e-8) + gradients, metrics = grad_fn( + params, + timesteps, + minibatch.observations, + minibatch.actions, + minibatch.behavior_log_probs, + minibatch.target_values, + advantages, + minibatch.behavior_values, + minibatch.hiddens, + ) + + # Apply updates + updates, opt_state = optimizer.update(gradients, opt_state) + params = optax.apply_updates(params, updates) + + metrics["norm_grad"] = optax.global_norm(gradients) + metrics["norm_updates"] = optax.global_norm(updates) + return (params, opt_state, timesteps), metrics + + def model_update_epoch( + carry: Tuple[ + jnp.ndarray, hk.Params, optax.OptState, int, Batch + ], + unused_t: Tuple[()], + ) -> Tuple[ + Tuple[jnp.ndarray, hk.Params, optax.OptState, Batch], + Dict[str, jnp.ndarray], + ]: + """Performs model updates based on one epoch of data.""" + key, params, opt_state, timesteps, batch = carry + key, subkey = jax.random.split(key) + permutation = jax.random.permutation(subkey, batch_size) + shuffled_batch = jax.tree_util.tree_map( + lambda x: jnp.take(x, permutation, axis=0), batch + ) + minibatches = jax.tree_util.tree_map( + lambda x: jnp.reshape( + x, [num_minibatches, -1] + list(x.shape[1:]) + ), + shuffled_batch, + ) + + (params, opt_state, timesteps), metrics = jax.lax.scan( + model_update_minibatch, + (params, opt_state, timesteps), + minibatches, + length=num_minibatches, + ) + return (key, params, opt_state, timesteps, batch), metrics + + params = state.params + opt_state = state.opt_state + timesteps = state.timesteps + + # Repeat training for the given number of epoch, taking a random + # permutation for every epoch. + # signature is scan(function, carry, tuple to iterate over, length) + (key, params, opt_state, timesteps, _), metrics = jax.lax.scan( + model_update_epoch, + (state.random_key, params, opt_state, timesteps, batch), + (), + length=num_epochs, + ) + + metrics = jax.tree_util.tree_map(jnp.mean, metrics) + metrics["rewards_mean"] = jnp.mean( + jnp.abs(jnp.mean(rewards, axis=(0, 1))) + ) + metrics["rewards_std"] = jnp.std(rewards, axis=(0, 1)) + + # Reset the memory + new_state = TrainingState( + params=params, + opt_state=opt_state, + random_key=key, + timesteps=timesteps + batch_size, + ) + + new_memory = MemoryState( + hidden=jnp.zeros(shape=(self._num_envs,) + (gru_dim,)), + extras={ + "log_probs": jnp.zeros(self._num_envs), + "values": jnp.zeros(self._num_envs), + }, + ) + + return new_state, new_memory, metrics + + def make_initial_state( + key: Any, initial_hidden_state: jnp.ndarray + ) -> TrainingState: + """Initialises the training state (parameters and optimiser state).""" + + # We pass through initial_hidden_state so its easy to batch memory + key, subkey = jax.random.split(key) + + if isinstance(obs_spec, dict): + dummy_obs = {} + for k, v in obs_spec.items(): + dummy_obs[k] = jnp.zeros(shape=v) + + else: + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + initial_params = network.init( + subkey, dummy_obs, initial_hidden_state + ) + initial_opt_state = optimizer.init(initial_params) + return TrainingState( + random_key=key, + params=initial_params, + opt_state=initial_opt_state, + timesteps=0, + ), MemoryState( + hidden=jnp.zeros( + (num_envs, initial_hidden_state.shape[-1]) + ), # initial_hidden_state, + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + ) + + # @jax.jit + def prepare_batch( + traj_batch: 
NamedTuple, + done: Any, + action_extras: dict, + ): + # Rollouts complete -> Training begins + # Add an additional rollout step for advantage calculation + _value = jax.lax.select( + done, + jnp.zeros_like(action_extras["values"]), + action_extras["values"], + ) + + _value = jax.lax.expand_dims(_value, [0]) + + # need to add final value here + traj_batch = traj_batch._replace( + behavior_values=jnp.concatenate( + [traj_batch.behavior_values, _value], axis=0 + ) + ) + return traj_batch + + # Initialise training state (parameters, optimiser state, extras). + self._state, self._mem = make_initial_state( + random_key, initial_hidden_state + ) + + self.make_initial_state = make_initial_state + + self._prepare_batch = prepare_batch + self._sgd_step = jax.jit(sgd_step) + + # Set up counters and logger + self._logger = Logger() + self._total_steps = 0 + self._until_sgd = 0 + self._logger.metrics = { + "total_steps": 0, + "sgd_steps": 0, + "loss_total": 0, + "loss_policy": 0, + "loss_value": 0, + "loss_entropy": 0, + "entropy_cost": entropy_coeff_start, + } + + # Initialize functions + self._policy = policy + self.forward = network.apply + self.player_id = player_id + + # Other useful hyperparameters + self._num_envs = num_envs # number of environments + self._num_minibatches = num_minibatches # number of minibatches + self._num_epochs = num_epochs # number of epochs to use sample + self._gru_dim = gru_dim + + def reset_memory(self, memory, eval=False) -> TrainingState: + num_envs = 1 if eval else self._num_envs + memory = memory._replace( + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + hidden=jnp.zeros((num_envs, self._gru_dim)), + ) + return memory + + def update( + self, + traj_batch: NamedTuple, + obs: jnp.ndarray, + state: TrainingState, + mem: MemoryState, + ): + + """Update the agent -> only called at the end of a trajectory""" + + _, _, mem = self._policy(state, obs, mem) + traj_batch = self._prepare_batch( + traj_batch, traj_batch.dones[-1, ...], mem.extras + ) + state, mem, metrics = self._sgd_step(state, traj_batch) + + # update logging + + self._logger.metrics["sgd_steps"] += ( + self._num_minibatches * self._num_epochs + ) + self._logger.metrics["loss_total"] = metrics["loss_total"] + self._logger.metrics["loss_policy"] = metrics["loss_policy"] + self._logger.metrics["loss_value"] = metrics["loss_value"] + self._logger.metrics["loss_entropy"] = metrics["loss_entropy"] + self._logger.metrics["entropy_cost"] = metrics["entropy_cost"] + return state, mem, metrics + + +# TODO: seed, and player_id not used in CartPole +def make_shaper_agent( + args, + agent_args, + obs_spec, + action_spec, + seed: int, + num_iterations: int, + player_id: int, +): + """Make PPO agent""" + # Network + if args.env_id == "CartPole-v1": + network, initial_hidden_state = make_GRU_cartpole_network(action_spec) + elif args.env_id == "coin_game": + network, initial_hidden_state = make_GRU_coingame_att_network( + action_spec, + agent_args.with_cnn, + agent_args.hidden_size, + agent_args.output_channels, + agent_args.kernel_shape, + ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipd_att_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipd_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_GRU_ipd_network( + action_spec, agent_args.hidden_size + ) + + elif 
args.env_id == "InTheMatrix": + if args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipditm_avg_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipditm_att_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + + gru_dim = initial_hidden_state.shape[1] + + initial_hidden_state = jnp.zeros( + (args.num_envs, initial_hidden_state.shape[1]) + ) + + # Optimizer + transition_steps = ( + num_iterations * agent_args.num_epochs * agent_args.num_minibatches + ) + + if agent_args.lr_scheduling: + scheduler = optax.linear_schedule( + init_value=agent_args.learning_rate, + end_value=0, + transition_steps=transition_steps, + ) + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale_by_schedule(scheduler), + optax.scale(-1), + ) + + else: + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale(-agent_args.learning_rate), + ) + + # Random key + random_key = jax.random.PRNGKey(seed=seed) + + agent = PPO( + network=network, + initial_hidden_state=initial_hidden_state, + optimizer=optimizer, + random_key=random_key, + gru_dim=gru_dim, + obs_spec=obs_spec, + num_envs=args.num_envs, + num_minibatches=agent_args.num_minibatches, + num_epochs=agent_args.num_epochs, + clip_value=agent_args.clip_value, + value_coeff=agent_args.value_coeff, + anneal_entropy=agent_args.anneal_entropy, + entropy_coeff_start=agent_args.entropy_coeff_start, + entropy_coeff_end=agent_args.entropy_coeff_end, + entropy_coeff_horizon=agent_args.entropy_coeff_horizon, + ppo_clipping_epsilon=agent_args.ppo_clipping_epsilon, + gamma=agent_args.gamma, + gae_lambda=agent_args.gae_lambda, + player_id=player_id, + ) + return agent + + +if __name__ == "__main__": + pass diff --git a/pax/agents/shaper_pred/networks.py b/pax/agents/shaper_pred/networks.py new file mode 100644 index 00000000..faa6b6db --- /dev/null +++ b/pax/agents/shaper_pred/networks.py @@ -0,0 +1,696 @@ +from typing import Optional, Tuple + +import distrax +import haiku as hk +import jax +import jax.numpy as jnp + +from pax import utils + + +class CategoricalValueHead(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHead_ipd(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits 
= self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate_ipditm(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + hidden_size: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class ContinuousValueHead(hk.Module): + """Network head that produces a continuous distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), # baseline + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), # baseline + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.MultivariateNormalDiag(loc=logits), value) + + +class Tabular(hk.Module): + def __init__(self, num_values: int): + super().__init__(name="Tabular") + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def _input_to_onehot(input: jnp.ndarray): + chunks = 
jnp.array([9**3, 9**2, 9, 1], dtype=jnp.int32) + idx = input.nonzero(size=4)[0] + idx = jnp.mod(idx, 9) + idx = chunks * idx + idx = jnp.sum(idx) + return jax.nn.one_hot(idx, num_classes=6561) + + self.input_to_onehot = jax.vmap(_input_to_onehot) + + def __call__(self, inputs: jnp.ndarray): + inputs = self.input_to_onehot(inputs) + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + + return (distrax.Categorical(logits=logits), value) + + +class CNN(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.conv_a_1 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.linear_a_0 = hk.Linear(output_channels) + + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + # Actor and Critic + x = self.conv_a_0(inputs) + x = jax.nn.relu(x) + x = self.conv_a_1(x) + x = jax.nn.relu(x) + x = self.flatten(x) + x = self.linear_a_0(x) + x = jax.nn.relu(x) + return x + + +class CNN_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + ) + # akbir suggested fix + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor and Critic + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + return x + + +class CNNSeparate_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape, num_actions: int): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_a_0 = hk.Linear(output_channels) + self.conv_v_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_v_0 = hk.Linear(1) + self.flatten = hk.Flatten() + + def __call__(self, inputs): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + logits = self.linear_a_0(x) + + # Critic + x = self.conv_v_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + x = self.linear_v_0(x) + val = x + return (distrax.Categorical(logits=logits), jnp.squeeze(val, axis=-1)) + + +def make_GRU_ipd_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + embedding, state = gru(inputs, state) + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_avg_network(num_actions: int, hidden_size: int): + hidden_state = 
jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_att_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + print(state.shape, 'STATE shape') + gru = hk.GRU(hidden_size) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 1 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + b_init=hk.initializers.Constant(0), + # with_bias=False, + ) + old_state = state + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state_attn + state) + state = shape_mlp(state) + state = 0.5*old_state + 0.5*state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_cartpole_network(num_actions: int): + hidden_size = 256 + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = hk.nets.MLP( + [hidden_size, hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU(hidden_size) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_coingame_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + 
return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_coingame_att_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def make_GRU_ipditm_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_att_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
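+        # The attention step below is what distinguishes this *_att variant: the carried hidden states are layer-normalised, mixed by multi-head self-attention (with a residual connection and a second layer norm), projected back to hidden_size by a linear layer, and only then used as the carry for the GRU update.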
+ # num_opps is our true batch size + # num_envs is actually part of our feature space + # let's use an attention network over the hidden states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper attention block: mix the carried hidden states before the GRU update + print("state", state.shape) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_avg_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...]
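+        # Unlike the *_att variant above, this network mixes the carried hidden states with a plain mean over the leading axis (presumably the opponent axis, per the comment above), blended 50/50 with the previous carry, instead of attending over them.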
+ # num_opps is our true batch size + # num_envs is actually part of our feature space + # let's average over the hidden states instead of attending over them + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # the averaged carry feeds straight into the GRU update + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def test_GRU(): + key = jax.random.PRNGKey(seed=0) + num_actions = 2 + hidden_size = 16 # arbitrary size for this smoke test; matches the IPD configs + obs_spec = (5,) + key, subkey = jax.random.split(key) + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + network, hidden = make_GRU_ipd_network(num_actions, hidden_size) + print(hidden.shape) + initial_params = network.init(subkey, dummy_obs, hidden) + print("GRU w_i", initial_params["gru"]["w_i"].shape) + print("GRU w_h", initial_params["gru"]["w_h"].shape) + print( + "Policy head", + initial_params["categorical_value_head/~/linear"]["w"].shape, + ) + print( + "Value head", + initial_params["categorical_value_head/~/linear_1"]["w"].shape, + ) + observation = jnp.zeros(shape=(10, 5)) + (logits, values), hidden = network.apply( + initial_params, observation, hidden + ) + print(hidden.shape) + return network + + +if __name__ == "__main__": + test_GRU() diff --git a/pax/agents/shaper_pred/ppo_gru.py b/pax/agents/shaper_pred/ppo_gru.py new file mode 100644 index 00000000..1de35b2d --- /dev/null +++ b/pax/agents/shaper_pred/ppo_gru.py @@ -0,0 +1,611 @@ +# Adapted from https://github.com/deepmind/acme/blob/master/acme/agents/jax/ppo/learning.py + +from typing import Any, Dict, NamedTuple, Tuple + +import haiku as hk +import jax +import jax.numpy as jnp +import optax + +from pax import utils +from pax.agents.agent import AgentInterface +from pax.agents.shaper_att.networks import ( + make_GRU_cartpole_network, + make_GRU_coingame_att_network, + make_GRU_ipd_network, + make_GRU_ipd_avg_network, + make_GRU_ipd_att_network, + make_GRU_ipditm_att_network, + make_GRU_ipditm_avg_network, +) +from pax.utils import MemoryState, TrainingState, get_advantages + +# from dm_env import TimeStep + + +class Batch(NamedTuple): + """A batch of data; all shapes are expected to be [B, ...].""" + + observations: jnp.ndarray + actions: jnp.ndarray + advantages: jnp.ndarray + + # Target value estimate used to bootstrap the value function. + target_values: jnp.ndarray + + # Value estimate and action log-prob at behavior time.
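+    # These rollout-time quantities are what the PPO loss needs: behavior_log_probs form the importance ratio exp(log_prob - behavior_log_probs), and behavior_values anchor the clipped value update.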
+ behavior_values: jnp.ndarray + behavior_log_probs: jnp.ndarray + + # GRU specific + hiddens: jnp.ndarray + + +class Logger: + metrics: dict + + +class PPO(AgentInterface): + """A simple PPO agent with memory using JAX""" + + def __init__( + self, + network: NamedTuple, + initial_hidden_state: jnp.ndarray, + optimizer: optax.GradientTransformation, + random_key: jnp.ndarray, + gru_dim: int, + obs_spec: Tuple, + num_envs: int = 4, + num_minibatches: int = 16, + num_epochs: int = 4, + clip_value: bool = True, + value_coeff: float = 0.5, + anneal_entropy: bool = False, + entropy_coeff_start: float = 0.1, + entropy_coeff_end: float = 0.01, + entropy_coeff_horizon: int = 3_000_000, + ppo_clipping_epsilon: float = 0.2, + gamma: float = 0.99, + gae_lambda: float = 0.95, + player_id: int = 0, + ): + @jax.jit + def policy( + state: TrainingState, observation: jnp.ndarray, mem: MemoryState + ): + """Agent policy to select actions and calculate agent specific information""" + key, subkey = jax.random.split(state.random_key) + (dist, values, pred), hidden_state = network.apply( + state.params, observation, mem.hidden + ) + + actions = dist.sample(seed=subkey) + mem.extras["values"] = values + mem.extras["log_probs"] = dist.log_prob(actions) + mem = mem._replace(hidden=hidden_state, extras=mem.extras) + state = state._replace(random_key=key) + return ( + actions, + pred, + state, + mem, + ) + + @jax.jit + def gae_advantages( + rewards: jnp.ndarray, values: jnp.ndarray, dones: jnp.ndarray + ) -> jnp.ndarray: + """Calculates the gae advantages from a sequence. Note that the + arguments are of length = rollout length + 1""" + # 'Zero out' the terminated states + discounts = gamma * jnp.logical_not(dones) + reverse_batch = ( + jnp.flip(values[:-1], axis=0), + jnp.flip(rewards, axis=0), + jnp.flip(discounts, axis=0), + ) + + _, advantages = jax.lax.scan( + get_advantages, + ( + jnp.zeros_like(values[-1]), + values[-1], + jnp.ones_like(values[-1]) * gae_lambda, + ), + reverse_batch, + ) + + advantages = jnp.flip(advantages, axis=0) + target_values = values[:-1] + advantages # Q-value estimates + target_values = jax.lax.stop_gradient(target_values) + return advantages, target_values + + def loss( + params: hk.Params, + timesteps: int, + observations: jnp.ndarray, + actions: jnp.array, + behavior_log_probs: jnp.array, + target_values: jnp.array, + advantages: jnp.array, + behavior_values: jnp.array, + hiddens: jnp.ndarray, + ): + """Surrogate loss using clipped probability ratios.""" + (distribution, values), _ = network.apply( + params, observations, hiddens + ) + + log_prob = distribution.log_prob(actions) + entropy = distribution.entropy() + + # Compute importance sampling weights: current policy / behavior policy. + rhos = jnp.exp(log_prob - behavior_log_probs) + + # Policy loss: Clipping + clipped_ratios_t = jnp.clip( + rhos, 1.0 - ppo_clipping_epsilon, 1.0 + ppo_clipping_epsilon + ) + clipped_objective = jnp.fmin( + rhos * advantages, clipped_ratios_t * advantages + ) + policy_loss = -jnp.mean(clipped_objective) + + # Value loss: MSE + value_cost = value_coeff + unclipped_value_error = target_values - values + unclipped_value_loss = unclipped_value_error**2 + + # Value clipping + if clip_value: + # Clip values to reduce variablility during critic training. 
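+                # Pessimistic value clipping: the clipped estimate may move at most ppo_clipping_epsilon from the rollout-time value, and the loss takes the elementwise maximum of the clipped and unclipped squared errors against the GAE value targets, i.e. value_loss = mean(max((V - V_target)^2, (V_old + clip(V - V_old, -eps, eps) - V_target)^2)).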
+ clipped_values = behavior_values + jnp.clip( + values - behavior_values, + -ppo_clipping_epsilon, + ppo_clipping_epsilon, + ) + clipped_value_error = target_values - clipped_values + clipped_value_loss = clipped_value_error**2 + value_loss = jnp.mean( + jnp.fmax(unclipped_value_loss, clipped_value_loss) + ) + else: + value_loss = jnp.mean(unclipped_value_loss) + + # Entropy loss: Standard entropy term + # Calculate the new value based on linear annealing formula + if anneal_entropy: + fraction = jnp.fmax(1 - timesteps / entropy_coeff_horizon, 0) + entropy_cost = ( + fraction * entropy_coeff_start + + (1 - fraction) * entropy_coeff_end + ) + # Constant Entropy term + else: + entropy_cost = entropy_coeff_start + entropy_loss = -jnp.mean(entropy) + + # Total loss: Minimize policy and value loss; maximize entropy + total_loss = ( + policy_loss + + entropy_cost * entropy_loss + + value_loss * value_cost + ) + + return total_loss, { + "loss_total": total_loss, + "loss_policy": policy_loss, + "loss_value": value_loss, + "loss_entropy": entropy_loss, + "entropy_cost": entropy_cost, + } + + @jax.jit + def sgd_step( + state: TrainingState, sample: NamedTuple + ) -> Tuple[TrainingState, Dict[str, jnp.ndarray]]: + """Performs a minibatch SGD step, returning new state and metrics.""" + # Extract data + ( + observations, + actions, + rewards, + behavior_log_probs, + behavior_values, + dones, + hiddens, + ) = ( + sample.observations, + sample.actions, + sample.rewards, + sample.behavior_log_probs, + sample.behavior_values, + sample.dones, + sample.hiddens, + ) + + # batch_gae_advantages = jax.vmap(gae_advantages, 1, (0, 0)) + advantages, target_values = gae_advantages( + rewards=rewards, values=behavior_values, dones=dones + ) + + # Exclude the last step - it was only used for bootstrapping. + # The shape is [num_steps, num_envs, ..] + behavior_values = behavior_values[:-1, :] + trajectories = Batch( + observations=observations, + actions=actions, + advantages=advantages, + behavior_log_probs=behavior_log_probs, + target_values=target_values, + behavior_values=behavior_values, + hiddens=hiddens, + ) + # Concatenate all trajectories. Reshape from [num_envs, num_steps, ..] + # to [num_envs * num_steps,..] + assert len(target_values.shape) > 1 + num_envs = target_values.shape[1] + num_steps = target_values.shape[0] + batch_size = num_envs * num_steps + assert batch_size % num_minibatches == 0, ( + "Num minibatches must divide batch size. Got batch_size={}" + " num_minibatches={}." + ).format(batch_size, num_minibatches) + + batch = jax.tree_util.tree_map( + lambda x: x.reshape((batch_size,) + x.shape[2:]), trajectories + ) + # Compute gradients. + grad_fn = jax.jit(jax.grad(loss, has_aux=True)) + + def model_update_minibatch( + carry: Tuple[hk.Params, optax.OptState, int], + minibatch: Batch, + ) -> Tuple[ + Tuple[hk.Params, optax.OptState, int], Dict[str, jnp.ndarray] + ]: + """Performs model update for a single minibatch.""" + params, opt_state, timesteps = carry + # Normalize advantages at the minibatch level before using them. 
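+                # Standard PPO trick: per-minibatch zero-mean / unit-variance advantages keep the scale of the clipped surrogate objective stable across epochs; the 1e-8 term below guards against a zero standard deviation.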
+ advantages = ( + minibatch.advantages + - jnp.mean(minibatch.advantages, axis=0) + ) / (jnp.std(minibatch.advantages, axis=0) + 1e-8) + gradients, metrics = grad_fn( + params, + timesteps, + minibatch.observations, + minibatch.actions, + minibatch.behavior_log_probs, + minibatch.target_values, + advantages, + minibatch.behavior_values, + minibatch.hiddens, + ) + + # Apply updates + updates, opt_state = optimizer.update(gradients, opt_state) + params = optax.apply_updates(params, updates) + + metrics["norm_grad"] = optax.global_norm(gradients) + metrics["norm_updates"] = optax.global_norm(updates) + return (params, opt_state, timesteps), metrics + + def model_update_epoch( + carry: Tuple[ + jnp.ndarray, hk.Params, optax.OptState, int, Batch + ], + unused_t: Tuple[()], + ) -> Tuple[ + Tuple[jnp.ndarray, hk.Params, optax.OptState, Batch], + Dict[str, jnp.ndarray], + ]: + """Performs model updates based on one epoch of data.""" + key, params, opt_state, timesteps, batch = carry + key, subkey = jax.random.split(key) + permutation = jax.random.permutation(subkey, batch_size) + shuffled_batch = jax.tree_util.tree_map( + lambda x: jnp.take(x, permutation, axis=0), batch + ) + minibatches = jax.tree_util.tree_map( + lambda x: jnp.reshape( + x, [num_minibatches, -1] + list(x.shape[1:]) + ), + shuffled_batch, + ) + + (params, opt_state, timesteps), metrics = jax.lax.scan( + model_update_minibatch, + (params, opt_state, timesteps), + minibatches, + length=num_minibatches, + ) + return (key, params, opt_state, timesteps, batch), metrics + + params = state.params + opt_state = state.opt_state + timesteps = state.timesteps + + # Repeat training for the given number of epoch, taking a random + # permutation for every epoch. + # signature is scan(function, carry, tuple to iterate over, length) + (key, params, opt_state, timesteps, _), metrics = jax.lax.scan( + model_update_epoch, + (state.random_key, params, opt_state, timesteps, batch), + (), + length=num_epochs, + ) + + metrics = jax.tree_util.tree_map(jnp.mean, metrics) + metrics["rewards_mean"] = jnp.mean( + jnp.abs(jnp.mean(rewards, axis=(0, 1))) + ) + metrics["rewards_std"] = jnp.std(rewards, axis=(0, 1)) + + # Reset the memory + new_state = TrainingState( + params=params, + opt_state=opt_state, + random_key=key, + timesteps=timesteps + batch_size, + ) + + new_memory = MemoryState( + hidden=jnp.zeros(shape=(self._num_envs,) + (gru_dim,)), + extras={ + "log_probs": jnp.zeros(self._num_envs), + "values": jnp.zeros(self._num_envs), + }, + ) + + return new_state, new_memory, metrics + + def make_initial_state( + key: Any, initial_hidden_state: jnp.ndarray + ) -> TrainingState: + """Initialises the training state (parameters and optimiser state).""" + + # We pass through initial_hidden_state so its easy to batch memory + key, subkey = jax.random.split(key) + + if isinstance(obs_spec, dict): + dummy_obs = {} + for k, v in obs_spec.items(): + dummy_obs[k] = jnp.zeros(shape=v) + + else: + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + initial_params = network.init( + subkey, dummy_obs, initial_hidden_state + ) + initial_opt_state = optimizer.init(initial_params) + return TrainingState( + random_key=key, + params=initial_params, + opt_state=initial_opt_state, + timesteps=0, + ), MemoryState( + hidden=jnp.zeros( + (num_envs, initial_hidden_state.shape[-1]) + ), # initial_hidden_state, + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + ) + + # @jax.jit + def prepare_batch( + traj_batch: 
NamedTuple, + done: Any, + action_extras: dict, + ): + # Rollouts complete -> Training begins + # Add an additional rollout step for advantage calculation + _value = jax.lax.select( + done, + jnp.zeros_like(action_extras["values"]), + action_extras["values"], + ) + + _value = jax.lax.expand_dims(_value, [0]) + + # need to add final value here + traj_batch = traj_batch._replace( + behavior_values=jnp.concatenate( + [traj_batch.behavior_values, _value], axis=0 + ) + ) + return traj_batch + + # Initialise training state (parameters, optimiser state, extras). + self._state, self._mem = make_initial_state( + random_key, initial_hidden_state + ) + + self.make_initial_state = make_initial_state + + self._prepare_batch = prepare_batch + self._sgd_step = jax.jit(sgd_step) + + # Set up counters and logger + self._logger = Logger() + self._total_steps = 0 + self._until_sgd = 0 + self._logger.metrics = { + "total_steps": 0, + "sgd_steps": 0, + "loss_total": 0, + "loss_policy": 0, + "loss_value": 0, + "loss_entropy": 0, + "entropy_cost": entropy_coeff_start, + } + + # Initialize functions + self._policy = policy + self.forward = network.apply + self.player_id = player_id + + # Other useful hyperparameters + self._num_envs = num_envs # number of environments + self._num_minibatches = num_minibatches # number of minibatches + self._num_epochs = num_epochs # number of epochs to use sample + self._gru_dim = gru_dim + + def reset_memory(self, memory, eval=False) -> TrainingState: + num_envs = 1 if eval else self._num_envs + memory = memory._replace( + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + hidden=jnp.zeros((num_envs, self._gru_dim)), + ) + return memory + + def update( + self, + traj_batch: NamedTuple, + obs: jnp.ndarray, + state: TrainingState, + mem: MemoryState, + ): + + """Update the agent -> only called at the end of a trajectory""" + + _, _, mem = self._policy(state, obs, mem) + traj_batch = self._prepare_batch( + traj_batch, traj_batch.dones[-1, ...], mem.extras + ) + state, mem, metrics = self._sgd_step(state, traj_batch) + + # update logging + + self._logger.metrics["sgd_steps"] += ( + self._num_minibatches * self._num_epochs + ) + self._logger.metrics["loss_total"] = metrics["loss_total"] + self._logger.metrics["loss_policy"] = metrics["loss_policy"] + self._logger.metrics["loss_value"] = metrics["loss_value"] + self._logger.metrics["loss_entropy"] = metrics["loss_entropy"] + self._logger.metrics["entropy_cost"] = metrics["entropy_cost"] + return state, mem, metrics + + +# TODO: seed, and player_id not used in CartPole +def make_shaper_agent( + args, + agent_args, + obs_spec, + action_spec, + seed: int, + num_iterations: int, + player_id: int, +): + """Make PPO agent""" + # Network + if args.env_id == "CartPole-v1": + network, initial_hidden_state = make_GRU_cartpole_network(action_spec) + elif args.env_id == "coin_game": + network, initial_hidden_state = make_GRU_coingame_att_network( + action_spec, + agent_args.with_cnn, + agent_args.hidden_size, + agent_args.output_channels, + agent_args.kernel_shape, + ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipd_att_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipd_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_GRU_ipd_network( + action_spec, agent_args.hidden_size + ) + + elif 
args.env_id == "InTheMatrix": + if args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipditm_avg_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipditm_att_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + + gru_dim = initial_hidden_state.shape[1] + + initial_hidden_state = jnp.zeros( + (args.num_envs, initial_hidden_state.shape[1]) + ) + + # Optimizer + transition_steps = ( + num_iterations * agent_args.num_epochs * agent_args.num_minibatches + ) + + if agent_args.lr_scheduling: + scheduler = optax.linear_schedule( + init_value=agent_args.learning_rate, + end_value=0, + transition_steps=transition_steps, + ) + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale_by_schedule(scheduler), + optax.scale(-1), + ) + + else: + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale(-agent_args.learning_rate), + ) + + # Random key + random_key = jax.random.PRNGKey(seed=seed) + + agent = PPO( + network=network, + initial_hidden_state=initial_hidden_state, + optimizer=optimizer, + random_key=random_key, + gru_dim=gru_dim, + obs_spec=obs_spec, + num_envs=args.num_envs, + num_minibatches=agent_args.num_minibatches, + num_epochs=agent_args.num_epochs, + clip_value=agent_args.clip_value, + value_coeff=agent_args.value_coeff, + anneal_entropy=agent_args.anneal_entropy, + entropy_coeff_start=agent_args.entropy_coeff_start, + entropy_coeff_end=agent_args.entropy_coeff_end, + entropy_coeff_horizon=agent_args.entropy_coeff_horizon, + ppo_clipping_epsilon=agent_args.ppo_clipping_epsilon, + gamma=agent_args.gamma, + gae_lambda=agent_args.gae_lambda, + player_id=player_id, + ) + return agent + + +if __name__ == "__main__": + pass diff --git a/pax/conf/experiment/imp/mfos_att_v_tabular.yaml b/pax/conf/experiment/imp/mfos_att_v_tabular.yaml new file mode 100644 index 00000000..963289cd --- /dev/null +++ b/pax/conf/experiment/imp/mfos_att_v_tabular.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[1, -1], [-1, 1], [-1, 1], [1, -1]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: imp + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/imp/shaper_att_v_tabular.yaml b/pax/conf/experiment/imp/shaper_att_v_tabular.yaml new file mode 100644 index 00000000..4fa219de --- /dev/null +++ b/pax/conf/experiment/imp/shaper_att_v_tabular.yaml @@ -0,0 +1,103 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[1, -1], [-1, 1], [-1, 1], [1, -1]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'avg' + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 
0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: imp + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml b/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml new file mode 100644 index 00000000..410c12d5 --- /dev/null +++ b/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'avg' + +# Evaluation +AVG-model +run_path: ucl-dark/imp/1dfrc0c5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10/2023-05-11_15.30.16.570714/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 
# eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type} + log: False + + diff --git a/pax/conf/experiment/impitm/train_shaper_att.yaml b/pax/conf/experiment/impitm/train_shaper_att.yaml new file mode 100644 index 00000000..b36e7ed3 --- /dev/null +++ b/pax/conf/experiment/impitm/train_shaper_att.yaml @@ -0,0 +1,116 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: InTheMatrix +env_type: meta +env_discount: 0.96 +freeze: 5 +payoff: [[[1, -1], [-1, 1]], [[-1, 1], [1, -1]]] +fixed_coins: False + +# Save +save: True +save_interval: 100 +benchmark: False + +# Runner +runner: evo + +# Training +top_k: 8 +popsize: 128 #512 +# total popsize = popsize * num_devices +num_envs: 50 +num_opps: 1 +num_devices: 8 +num_outer_steps: 500 +num_inner_steps: 152 +num_iters: 5000 +att_type: avg + +# Evaluation +run_path: ucl-dark/cg/3mpgbfm2 +model_path: exp/coin_game-EARL-PPO_memory-vs-Random/run-seed-0/2022-09-08_20.41.03.643377/generation_30 + +# PPO agent parameters +ppo1: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: False # only works with CNN + hidden_size: 32 + +ppo2: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: True # only works with CNN + hidden_size: 8 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES, SimpleGA] + sigma_init: 0.075 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.05 # Initial learning rate + lrate_decay: 0.999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + elite_ratio: 0.1 + centered_rank: True # Fitness centered_rank + w_decay: 0.1 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: False # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipditm + group: 'shaping-${agent1}-vs-${agent2}' + name: run-seed-${seed} + log: True \ No newline at end of file diff --git a/pax/conf/experiment/ipd/gs_v_ppo.yaml b/pax/conf/experiment/ipd/gs_v_ppo.yaml index b2ef7f3e..051dd9ba 100644 --- 
a/pax/conf/experiment/ipd/gs_v_ppo.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo.yaml @@ -11,7 +11,7 @@ env_discount: 0.96 payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] # Runner -runner: evo +runner: evo # Training top_k: 5 @@ -44,6 +44,26 @@ ppo1: with_cnn: False hidden_size: 16 +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + # ES parameters es: algo: OpenES # [OpenES, CMA_ES] diff --git a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml index 55995b03..6bc1de60 100644 --- a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 @@ -44,7 +44,25 @@ ppo1: with_cnn: False hidden_size: 16 - +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 # ES parameters es: diff --git a/pax/conf/experiment/ipd/gs_v_tabular.yaml b/pax/conf/experiment/ipd/gs_v_tabular.yaml index 639a3cf4..aac6e0f4 100644 --- a/pax/conf/experiment/ipd/gs_v_tabular.yaml +++ b/pax/conf/experiment/ipd/gs_v_tabular.yaml @@ -18,45 +18,12 @@ top_k: 5 popsize: 1000 num_envs: 2 num_opps: 1 -num_outer_steps: 1 +num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 num_devices: 1 -# Evaluation -# GS vs. Tabular trained on seed=0, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/1gg0p92x -# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_01.57.34.854198/generation_4900 - -# GS vs. Tabular trained on seed=1, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/scffrmfv -# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-28_05.00.56.131987/generation_4900 - -# GS vs. Tabular trained on seed=2, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/2858x8sa -# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-28_07.38.37.221049/generation_4900 - -# GS vs. Tabular trained on seed=3, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/1y9tefvj -# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-28_01.57.40.696321/generation_4900 - -# GS vs. Tabular trained on seed=4, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/8j6zmb6h -# model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-28_05.11.49.206169/generation_4900 - -# GS vs. Tabular trained on seed = 0 -# run_path: ucl-dark/ipd/tywwxijw -# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_16.06.55.715665/generation_4900 -# GS vs. 
Tabular trained on seed = 1 -# run_path: ucl-dark/ipd/2lyn9n10 -# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_16.07.48.978281/generation_4900 -# GS vs. Tabular trained on seed = 2 -# run_path: ucl-dark/ipd/f2xhuhcz -# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_16.08.35.015944/generation_4900 -# GS vs. Tabular trained on seed = 3 -# run_path: ucl-dark/ipd/16wzxeb6 -# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_16.09.01.274669/generation_4900 -# GS vs. Tabular trained on seed = 4 +# Evaluation run_path: ucl-dark/ipd/3dzkof3f model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_16.41.50.643263/generation_4900 @@ -81,6 +48,27 @@ ppo1: with_cnn: False hidden_size: 16 +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + # ES parameters es: algo: OpenES # [OpenES, CMA_ES] @@ -91,7 +79,7 @@ es: init_max: 0.0 # Range of parameter mean initialization - Max clip_min: -1e10 # Range of parameter proposals - Min clip_max: 1e10 # Range of parameter proposals - Max - lrate_init: 0.01 # Initial learning rate + lrate_init: 0.1 # Initial learning rate lrate_decay: 0.9999 # Multiplicative decay factor lrate_limit: 0.001 # Smallest possible lrate beta_1: 0.99 # Adam - beta_1 diff --git a/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..ace3f5aa --- /dev/null +++ b/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml @@ -0,0 +1,140 @@ +# @package _global_ + +# Agents +agent1: 'PPO' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +stop: 2 + +# run_path: ucl-dark/ipd/3ipiqfwz +# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2023-05-23_13.41.36.367352/generation_900 + +run_path: ucl-dark/ipd/hl9q06ix +model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2023-05-23_15.00.59.246054/generation_300 +# Evaluation +# # AVG-model 0 +# run_path: ucl-dark/ipd/1n313hkb +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml b/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml new file mode 100644 index 00000000..d82a379b --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'PPO' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. 
Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 + + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml new file mode 100644 index 00000000..e1c4900c --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. 
Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 + + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..a66b2aa9 --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +stop: 100 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml new file mode 100644 index 00000000..3d641a1a --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml @@ -0,0 +1,155 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_v_tabular.yaml b/pax/conf/experiment/ipd/mfos_v_tabular.yaml index ea55983e..f414ef45 100644 --- a/pax/conf/experiment/ipd/mfos_v_tabular.yaml +++ b/pax/conf/experiment/ipd/mfos_v_tabular.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 diff --git a/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml b/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml index 61ec02ee..5314a22b 100644 --- a/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml +++ b/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml @@ -42,7 +42,27 @@ ppo1: entropy_coeff_horizon: 1e7 entropy_coeff_end: 0.001 lr_scheduling: True - learning_rate: 0.001 + learning_rate: 0.1 #0.001 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 4 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: True + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 1e7 + entropy_coeff_end: 0.001 + lr_scheduling: True + learning_rate: 0.1 adam_epsilon: 1e-5 with_memory: True with_cnn: False diff --git a/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml b/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml new file mode 100644 index 00000000..5f640045 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml @@ -0,0 +1,103 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 
'PPO' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'nothing' + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.01 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml b/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml new file mode 100644 index 00000000..123dfb16 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml @@ -0,0 +1,98 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'nothing' + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 0.1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + 
num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 0.1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.01 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml new file mode 100644 index 00000000..c760e511 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo_mixed_payoff + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'nothing' +num_devices: 1 + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of 
parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..b023a2e2 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +stop: 2 + +# Evaluation +# # AVG-model 0 +# run_path: ucl-dark/ipd/1n313hkb +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + 
value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml new file mode 100644 index 00000000..5e37177c --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_v_ppo.yaml b/pax/conf/experiment/ipd/shaper_v_ppo.yaml index 1543cb30..8ce491e7 100644 --- a/pax/conf/experiment/ipd/shaper_v_ppo.yaml +++ b/pax/conf/experiment/ipd/shaper_v_ppo.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 @@ -79,7 +79,7 @@ es: init_max: 0.0 # Range of parameter mean initialization - Max clip_min: -1e10 # Range of parameter proposals - Min clip_max: 1e10 # Range of parameter proposals - Max - lrate_init: 0.01 # Initial learning rate + lrate_init: 0.1 # Initial learning rate lrate_decay: 0.9999 # Multiplicative decay factor lrate_limit: 0.001 # Smallest possible lrate beta_1: 0.99 # Adam - beta_1 diff --git a/pax/conf/experiment/ipd/shaper_v_tabular.yaml b/pax/conf/experiment/ipd/shaper_v_tabular.yaml index 10ba189f..364a14d2 100644 --- a/pax/conf/experiment/ipd/shaper_v_tabular.yaml +++ b/pax/conf/experiment/ipd/shaper_v_tabular.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml new file mode 100644 index 00000000..f66ebd0f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + 
+# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml new file mode 100644 index 00000000..3ca9736f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml new file mode 100644 index 00000000..60384066 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml new file mode 100644 index 00000000..4042ad3d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml new file mode 100644 index 00000000..e869d86e --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml new file mode 100644 index 00000000..ddbd91c4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml new file mode 100644 index 00000000..4307f733 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml new file mode 100644 index 00000000..f39b4163 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml new file mode 100644 index 00000000..7ea81f4d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml new file mode 100644 index 00000000..90533f91 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml new file mode 100644 index 00000000..e147b834 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml new file mode 100644 index 00000000..88552d56 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml new file mode 100644 index 00000000..1ced362c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml new file mode 100644 index 00000000..9484f767 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml new file mode 100644 index 00000000..b60172de --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml new file mode 100644 index 00000000..389cbe64 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml new file mode 100644 index 00000000..edf3b243 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml new file mode 100644 index 00000000..3b021ef9 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml new file mode 100644 index 00000000..2dd1a6f8 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml new file mode 100644 index 00000000..ca5deaae --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml new file mode 100644 index 00000000..7787db5c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml new file mode 100644 index 00000000..f3f34090 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml new file mode 100644 index 00000000..cb33a46c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml new file mode 100644 index 00000000..afbdf4fd --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml new file mode 100644 index 00000000..a9a1f7ca --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml new file mode 100644 index 00000000..332d7f09 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml new file mode 100644 index 00000000..028a2570 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml new file mode 100644 index 00000000..a78712ef --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml new file mode 100644 index 00000000..ab859fad --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml new file mode 100644 index 00000000..2fc8c00d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml new file mode 100644 index 00000000..2c4d74a4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml new file mode 100644 index 00000000..57455307 --- /dev/null +++ 
b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: 
True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml new file mode 100644 index 00000000..013741f7 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + 
lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml new file mode 100644 index 00000000..24ceeebd --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + 
clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml new file mode 100644 index 00000000..4c412365 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + 
num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml new file mode 100644 index 00000000..5555c0e5 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml new file mode 100644 index 00000000..93f850c1 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml new file mode 100644 index 00000000..a276e9c2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: 
[[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: 
run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml new file mode 100644 index 00000000..3ca42a9c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 
1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml new file mode 100644 index 00000000..7c853ef8 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 
0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml new file mode 100644 index 00000000..9a417276 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + 
lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml new file mode 100644 index 00000000..4eb364ef --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise 
fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml new file mode 100644 index 00000000..2a0106e4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml 
b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml new file mode 100644 index 00000000..3954f424 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml new file mode 100644 index 00000000..e5c993b2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # 
nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml new file mode 100644 index 00000000..03cb9e85 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 
+num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml new file mode 100644 index 00000000..b78a948b --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 
+num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml new file mode 100644 index 00000000..d1ed08a2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml new file mode 100644 index 00000000..136903f9 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + 
lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml new file mode 100644 index 00000000..5e371be3 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + 
clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml new file mode 100644 index 00000000..bcc10e07 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 
0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml new file mode 100644 index 00000000..1a108e1f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent 
parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml new file mode 100644 index 00000000..7801c704 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # 
[OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml new file mode 100644 index 00000000..8c308c50 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False 
# Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipditm/train_shaper_att.yaml b/pax/conf/experiment/ipditm/train_shaper_att.yaml new file mode 100644 index 00000000..2ba52646 --- /dev/null +++ b/pax/conf/experiment/ipditm/train_shaper_att.yaml @@ -0,0 +1,116 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: InTheMatrix +env_type: meta +env_discount: 0.96 +freeze: 5 +payoff: [[[3, 0], [5, 1]], [[3, 5], [0, 1]]] +fixed_coins: True + +# Save +save: True +save_interval: 100 +benchmark: False + +# Runner +runner: evo + +# Training +top_k: 8 +popsize: 128 #512 +# total popsize = popsize * num_devices +num_envs: 50 +num_opps: 1 +num_devices: 8 +num_outer_steps: 500 +num_inner_steps: 152 +num_iters: 5000 +att_type: avg + +# Evaluation +run_path: ucl-dark/cg/3mpgbfm2 +model_path: exp/coin_game-EARL-PPO_memory-vs-Random/run-seed-0/2022-09-08_20.41.03.643377/generation_30 + +# PPO agent parameters +ppo1: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: False # only works with CNN + hidden_size: 32 + +ppo2: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: True # only works with CNN + hidden_size: 8 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES, SimpleGA] + sigma_init: 0.075 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.05 # Initial learning rate + lrate_decay: 0.999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + elite_ratio: 0.1 + centered_rank: True # Fitness centered_rank + w_decay: 0.1 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: False # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipditm + group: 'shaping-${agent1}-vs-${agent2}' + name: run-seed-${seed} + log: True \ No newline at end of file diff --git a/pax/experiment.py b/pax/experiment.py index 15493d08..6ca8cb90 100644 --- a/pax/experiment.py +++ b/pax/experiment.py @@ -19,6 +19,7 @@ from 
pax.agents.naive_exact import NaiveExact from pax.agents.ppo.ppo import make_agent from pax.agents.ppo.ppo_gru import make_gru_agent +from pax.agents.shaper_att.ppo_gru import make_shaper_agent from pax.agents.strategies import ( Altruistic, Defect, @@ -56,17 +57,30 @@ from pax.envs.iterated_tensor_game_n_player import ( EnvParams as IteratedTensorGameNPlayerParams, ) + +from pax.runners.runner_stevie import StevieRunner +from pax.runners.runner_eval import EvalRunner +from pax.runners.runner_eval_multishaper import MultishaperEvalRunner +from pax.runners.runner_eval_hardstop import EvalHardstopRunner +from pax.runners.runner_evo import EvoRunner +from pax.runners.runner_evo_multishaper import MultishaperEvoRunner +from pax.runners.runner_evo_hardstop import EvoHardstopRunner +from pax.runners.experimental.runner_evo_mixed_lr import EvoMixedLRRunner +from pax.runners.experimental.runner_evo_mixed_payoffs import EvoMixedPayoffRunner +from pax.runners.experimental.runner_evo_mixed_IPD_payoffs import EvoMixedIPDPayoffRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_input import EvoMixedPayoffInputRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_gen import EvoMixedPayoffGenRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_only_opp import EvoMixedPayoffOnlyOppRunner +from pax.runners.runner_evo_scanned import EvoScannedRunner + from pax.envs.iterated_tensor_game_n_player import IteratedTensorGameNPlayer from pax.envs.rice.c_rice import ClubRice from pax.envs.rice.rice import Rice, EnvParams as RiceParams from pax.envs.rice.sarl_rice import SarlRice from pax.runners.runner_evo_nroles import EvoRunnerNRoles from pax.runners.runner_weight_sharing import WeightSharingRunner -from pax.runners.runner_eval import EvalRunner -from pax.runners.runner_eval_multishaper import MultishaperEvalRunner -from pax.runners.runner_evo import EvoRunner -from pax.runners.runner_evo_multishaper import MultishaperEvoRunner from pax.runners.runner_ipditm_eval import IPDITMEvalRunner + from pax.runners.runner_marl import RLRunner from pax.runners.runner_marl_nplayer import NplayerRLRunner from pax.runners.runner_sarl import SARLRunner @@ -275,14 +289,25 @@ def runner_setup(args, env, agents, save_dir, logger): if args.runner == "eval": logger.info("Evaluating with EvalRunner") return EvalRunner(agents, env, args) + + elif args.runner == "stevie": + logger.info("Activating Stevie Wonder Mode") + return StevieRunner(agents, env, args) + + elif args.runner == "eval_hardstop": + logger.info("Activating Eval Hardstop") + return EvalHardstopRunner(agents, env, args) + elif args.runner == "multishaper_eval": logger.info("Training with multishaper eval Runner") return MultishaperEvalRunner(agents, env, save_dir, args) + elif args.runner == "ipditm_eval": logger.info("Evaluating with ipditmEvalRunner") return IPDITMEvalRunner(agents, env, save_dir, args) - if args.runner in ["evo", "multishaper_evo", "evo_nroles"]: + if args.runner in ["evo", "evo_mixed_lr", "evo_hardstop", "evo_mixed_payoff", "evo_mixed_ipd_payoff", + "evo_mixed_payoff_gen", "evo_mixed_payoff_input", "evo_scanned", "evo_mixed_payoff_only_opp", "multishaper_evo", "evo_nroles"]: agent1 = agents[0] algo = args.es.algo strategies = {"CMA_ES", "OpenES", "PGPE", "SimpleGA"} @@ -367,16 +392,46 @@ def get_pgpe_strategy(agent): strategy, es_params, param_reshaper = get_ga_strategy(agent1) logger.info(f"Evolution Strategy: {algo}") - if args.runner == "evo": - logger.info("Training with EVO runner") + + if args.runner == 
"evo_hardstop": + return EvoHardstopRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo": return EvoRunner( - agents, - env, - strategy, - es_params, - param_reshaper, - save_dir, - args, + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_lr": + return EvoMixedLRRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff": + return EvoMixedPayoffRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_ipd_payoff": + return EvoMixedIPDPayoffRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_gen": + return EvoMixedPayoffGenRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_input": + return EvoMixedPayoffInputRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_pred": + return EvoMixedPayoffPredRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_only_opp": + return EvoMixedPayoffOnlyOppRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_scanned": + return EvoScannedRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args ) elif args.runner == "evo_nroles": @@ -433,8 +488,31 @@ def agent_setup(args, env, env_params, logger): else: obs_shape = env.observation_space(env_params).shape + if args.runner in ["evo_mixed_payoff_input"]: + obs_shape_meta = env.observation_space(env_params).n + 8 + else: + obs_shape_meta = obs_shape + + # print(obs_shape, "obs_shape") + num_actions = env.num_actions + def get_Shaper_agent(seed, player_id): + player_args = args.ppo1 if player_id == 1 else args.ppo2 + num_iterations = args.num_iters + if player_id == 1 and args.env_type == "meta": + num_iterations = args.num_outer_steps + return make_shaper_agent( + args, + player_args, + obs_spec=obs_shape_meta, + action_spec=num_actions, + seed=seed, + num_iterations=num_iterations, + player_id=player_id, + ) + + def get_LOLA_agent(seed, player_id): return make_lola( args, @@ -447,6 +525,7 @@ def get_LOLA_agent(seed, player_id): env_reset=env.reset, ) + def get_PPO_memory_agent(seed, player_id): default_player_args = omegaconf.OmegaConf.select( args, "ppo_default", default=None @@ -594,6 +673,7 @@ def get_stay_agent(seed, player_id): "LOLA": get_LOLA_agent, "PPO": get_PPO_agent, "PPO_memory": get_PPO_memory_agent, + "Shaper": get_Shaper_agent, "Naive": get_naive_pg, "Tabular": get_PPO_tabular_agent, "MFOS": get_mfos_agent, @@ -733,6 +813,7 @@ def naive_pg_log(agent): "PPO": ppo_log, "LOLA": dumb_log, "PPO_memory": ppo_memory_log, + "Shaper": ppo_memory_log, "Naive": naive_pg_log, "Hyper": hyper_log, "NaiveEx": naive_logger, @@ -795,7 +876,10 @@ def main(args): print(f"Number of Training Iterations: {args.num_iters}") - if args.runner in ["evo", "evo_nroles", "multishaper_evo"]: + if args.runner in ["evo", "evo_mixed_lr", "evo_hardstop", "evo_mixed_payoff", "evo_mixed_ipd_payoff", + "evo_mixed_payoff_gen", "evo_mixed_payoff_input", "evo_scanned", "evo_mixed_payoff_only_opp", "multishaper_evo", "evo_nroles"]: + print(f"Running {args.runner}") + runner.run_loop(env_params, agent_pair, args.num_iters, watchers) elif args.runner == "rl" or args.runner == 
"tensor_rl_nplayer": # number of episodes @@ -804,13 +888,8 @@ def main(args): elif args.runner == "ipditm_eval" or args.runner == "multishaper_eval": runner.run_loop(env_params, agent_pair, watchers) - elif args.runner == "sarl": - print(f"Number of Episodes: {args.num_iters}") - runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) - elif args.runner == "weight_sharing": - print(f"Number of Episodes: {args.num_iters}") - runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) - elif args.runner == "eval": + + elif args.runner in ["eval", "stevie", "eval_hardstop", "weight_sharing", "sarl"] or args.runner == 'stevie' or args.runner == "eval_hardstop": print(f"Number of Episodes: {args.num_iters}") runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) diff --git a/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py b/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py new file mode 100644 index 00000000..8c88236e --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py @@ -0,0 +1,671 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedIPDPayoffRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Each opponent has a different payoff matrix that follows the IPD conditions but each member + of the evo population plays against the same payoff matrices to ensure fair comparison. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + payoffs = jnp.array([0, 0, 0, 0], dtype=jnp.int8) + def cond_fun(val): + _rng_run, payoffs = val + return 2*payoffs[1] <= (payoffs[0] + payoffs[2]) + def body_fun(val): + _rng_run, payoffs = val + _rng_run, payoff_T, payoff_R, payoff_P, payoff_S = jax.random.split(_rng_run, 5) + T = jax.random.randint(payoff_T, minval=0, maxval=2, shape=(1,), dtype=jnp.int8)[0] + R = jax.random.randint(payoff_R, minval=T, maxval=4, shape=(1,), dtype=jnp.int8)[0] + P = jax.random.randint(payoff_P, minval=R, maxval=6, shape=(1,), dtype=jnp.int8)[0] + S = jax.random.randint(payoff_S, minval=P, maxval=8, shape=(1,), dtype=jnp.int8)[0] + # payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + payoffs = jnp.array([T, R, P, S], dtype=jnp.int8) + return (_rng_run, payoffs) + # _rng_run, payoff_T, payoff_R, payoff_P, payoff_S = 
jax.random.split(_rng_run, 5) + # T = jax.random.randint(payoff_T, minval=0, maxval=2, shape=(1,), dtype=jnp.int8)[0] + # R = jax.random.randint(payoff_R, minval=T, maxval=4, shape=(1,), dtype=jnp.int8)[0] + # P = jax.random.randint(payoff_P, minval=R, maxval=6, shape=(1,), dtype=jnp.int8)[0] + # S = jax.random.randint(payoff_S, minval=P, maxval=8, shape=(1,), dtype=jnp.int8)[0] + _rng_run, payoffs = jax.lax.while_loop(cond_fun, body_fun, (_rng_run, payoffs)) + T = payoffs[0] + R = payoffs[1] + P = payoffs[2] + S = payoffs[3] + payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + # payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: 
Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + 
"--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_lr.py b/pax/runners/experimental/runner_evo_mixed_lr.py new file mode 100644 index 00000000..bb8942f7 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_lr.py @@ -0,0 +1,642 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedLRRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. 
+ Each opponent has a different learning rate, but the members of the population + play against the same learning rates to ensure a fair comparison. + + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env 
first step to init. + init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + # jax.debug.print("env_params: {x}", x=env_params) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [args.num_opps] + random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(args.num_opps,)) + # # repeat the array popsize-times 
along the first dimension + learning_rates = jnp.tile(random_numbers, (args.popsize, 1)) + a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = 
param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # 
metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs.py b/pax/runners/experimental/runner_evo_mixed_payoffs.py new file mode 100644 index 00000000..00a254a2 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs.py @@ -0,0 +1,646 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Payoff matrix is randomly sampled at each rollout. Each opponent has a different payoff matrix. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) 
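# Aside (illustrative sketch, not part of this runner): the extra device axis
# mentioned above comes from wrapping _rollout in jax.pmap further down in
# __init__, where in_axes picks which arguments are split per device and which
# are broadcast. A toy version of that pattern, with hypothetical shapes:
import jax
import jax.numpy as jnp

n_devices = jax.local_device_count()

def toy_rollout(params, shared_rng):
    # params is the per-device slice of the population; shared_rng is broadcast.
    return params.sum() + jax.random.uniform(shared_rng)

p_rollout = jax.pmap(toy_rollout, in_axes=(0, None))
params = jnp.ones((n_devices, 8))                 # leading axis == device count
out = p_rollout(params, jax.random.PRNGKey(0))    # result has shape (n_devices,)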
+ # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
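# Aside (illustrative sketch, not part of this runner): run_loop further down
# follows the evosax ask/evaluate/tell cycle (ask for a population, reshape
# the parameters, roll out the meta-game, shape the fitness, tell the strategy).
# A condensed, self-contained sketch of that cycle, assuming the same evosax
# ask/tell API this file already uses and a toy quadratic objective in place of
# the meta-game rollout:
import jax
import jax.numpy as jnp
from evosax import FitnessShaper, OpenES

rng = jax.random.PRNGKey(0)
strategy = OpenES(popsize=64, num_dims=2)
es_params = strategy.default_params
state = strategy.initialize(rng, es_params)
shaper = FitnessShaper(maximize=True, centered_rank=True)

for _ in range(20):
    rng, rng_ask = jax.random.split(rng)
    x, state = strategy.ask(rng_ask, state, es_params)   # (popsize, num_dims)
    fitness = -jnp.sum(x**2, axis=-1)                    # toy objective
    state = strategy.tell(x, shaper.apply(x, fitness), state, es_params)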
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, 
args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + 
init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, 
(overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py b/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py new file mode 100644 index 00000000..f68f9fc6 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py @@ -0,0 +1,645 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffGenRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + # payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, 
a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, 
agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], 
log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_input.py b/pax/runners/experimental/runner_evo_mixed_payoffs_input.py new file mode 100644 index 00000000..9601852e --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_input.py @@ -0,0 +1,663 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffInputRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Add payoff matrices as input to agents so they don't have to figure out payoff matrices on the go. + Either randomly sample and set a payoff matrix + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + # print("OBS1 shape: ", obs1.shape) + # print("env params shape: ", env_params.payoff_matrix.shape) + # flatten the payoff matrix and append it to the observations + # the observations have shape (500, 10, 2, 5) and the payoff matrix has shape (10, 4, 2) + # we want to append the payoff matrix to the observations so that the observations have shape (500, 10, 2, 5+8) + # we want to flatten the payoff matrix so that it has shape (10, 8) + # This is the code + payoff_matrix = env_params.payoff_matrix.reshape((self.args.num_opps, 8)) + payoff_matrix = jnp.tile(jnp.expand_dims(jnp.tile(payoff_matrix, (self.args.popsize, 1, 1)), 2), (1, 1, 2, 1)) + obs1 = jnp.concatenate((obs1, payoff_matrix), axis=3) + # obs2 = jnp.concatenate((obs2, payoff_matrix), axis=3) + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + # print("OBS2 shape: ", obs2.shape) + # payoff_matrix = env_params.payoff_matrix.reshape((10, 8)) + # payoff_matrix = jnp.tile(jnp.expand_dims(jnp.tile(payoff_matrix, (500, 1, 1)), 2), (1, 1, 2, 1)) + # obs2_update = jnp.concatenate((obs2, payoff_matrix), axis=3) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + 
[jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + # payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.array([[-1, -1], [-3, 0], [0, -3], [-2, -2]]) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + 
print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent 
{1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py b/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py new file mode 100644 index 00000000..873aeefc --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py @@ -0,0 +1,657 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffOnlyOppRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Opponent plays a noisy payoff function of the original IPD payoff matrix. + Same noise applied to all opponents. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. 
+ env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
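# Standalone sketch (illustrative, not part of this patch) of the payoff
# construction in the rollout below: agent 1's IPD payoffs stay fixed while
# only the opponent's column is perturbed with uniform noise in [-0.5, 0.5),
# and the same noise is shared across opponents (it differs per environment).
# num_envs and num_opps are illustrative assumptions.
import jax
import jax.numpy as jnp

num_envs, num_opps = 2, 3
rng = jax.random.PRNGKey(0)

base = jnp.tile(jnp.array([1.0, 3.0, 0.0, 2.0]), (num_envs, 1))[..., None]   # [num_envs, 4, 1]
noise = jax.random.uniform(rng, minval=-0.5, maxval=0.5, shape=(num_envs, 4, 1))
payoff = -jnp.concatenate((base, base + noise), axis=-1)                      # [num_envs, 4, 2]
payoff = jnp.tile(payoff, (num_opps, 1, 1, 1))                                # [num_opps, num_envs, 4, 2]
print(payoff.shape)  # (3, 2, 4, 2); column 0 is the unperturbed IPD payoff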
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + # jnp.array([T, R, P, S], dtype=jnp.int8) + payoff_matrix_opp = jax.random.uniform(payoff_rng, minval=-0.5, maxval=0.5, shape=(args.num_envs,4,1)) #, dtype=jnp.int8 + payoff_matrix_ag1 = jnp.expand_dims(jnp.tile(jnp.array([1, 3, 0, 2], dtype=jnp.int8), (args.num_envs,1)), axis=-1) + payoff_matrix_ag2 = payoff_matrix_opp + payoff_matrix_ag1 + payoff_matrix = -jnp.concatenate((payoff_matrix_ag1, payoff_matrix_ag2), axis=-1) + + # payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + # payoff_matrix = -jnp.array([[1, payoff_matrix_opp[1]], + # [3, payoff_matrix_opp[0]], + # [0, payoff_matrix_opp[3]], + # [2, payoff_matrix_opp[2]]]) #, dtype=jnp.int8 + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1, 1)) + + _env_params.payoff_matrix = payoff_matrix + + obs, 
env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, 
es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": 
log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/runner_eval.py b/pax/runners/runner_eval.py index fb63f98d..44648dc9 100644 --- a/pax/runners/runner_eval.py +++ b/pax/runners/runner_eval.py @@ -410,6 +410,15 @@ def run_loop(self, env, env_params, agents, num_episodes, watchers): ) = vals traj_1, traj_2, env_states, a2_metrics = stack + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + # jax.debug.breakpoint() + traj_1_rewards = traj_1.rewards.mean(axis=(1,3)) + traj_2_rewards = traj_2.rewards.mean(axis=(1,3)) + for i in range(len(traj_1_rewards)): + wandb.log({"r1": traj_1_rewards[i].item()}, step=i) + wandb.log({"r2": traj_2_rewards[i].item()}, step=i) + rewards_1 = jnp.concatenate([traj.rewards for traj in traj_1]) rewards_2 = jnp.concatenate([traj.rewards for traj in traj_2]) diff --git a/pax/runners/runner_eval_hardstop.py b/pax/runners/runner_eval_hardstop.py new file mode 100644 index 00000000..c301d9c5 --- /dev/null +++ b/pax/runners/runner_eval_hardstop.py @@ -0,0 +1,474 @@ +import os +import time +from typing import NamedTuple + +import jax +import jax.numpy as jnp + +import wandb +from pax.utils import load +from pax.watchers import cg_visitation, ipd_visitation + +MAX_WANDB_CALLS = 10000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvalHardstopRunner: + """ + Evaluation runner provides a convenient example for quickly writing + a shaping eval runner for PAX. The EvalRunner class can be used to + run any two agents together either in a meta-game or regular game, it composes together agents, + watchers, and the environment. Within the init, we declare vmaps and pmaps for training. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is important for + logic used in the class. + env (gymnax.envs.Environment): + The environment that the agents will run in. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__(self, agents, env, args): + self.train_episodes = 0 + self.start_time = time.time() + self.args = args + self.num_opps = args.num_opps + self.random_key = jax.random.PRNGKey(args.seed) + self.run_path = args.run_path + self.model_path = args.model_path + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(cg_visitation) + # VMAP for num envs: we vmap over the rng but not params + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # VMAP for num opps: we vmap over the rng but not params + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + + self.split = jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)) + + agent1, agent2 = agents + + if args.agent1 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent1.batch_init = jax.jit(jax.vmap(agent1.make_initial_state)) + else: + # batch MemoryState not TrainingState + agent1.batch_init = jax.vmap( + agent1.make_initial_state, + (None, 0), + (None, 0), + ) + agent1.batch_reset = jax.jit( + jax.vmap(agent1.reset_memory, (0, None), 0), static_argnums=1 + ) + + agent1.batch_policy = jax.jit( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)) + ) + + # batch all for Agent2 + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit(jax.vmap(agent2.make_initial_state)) + else: + agent2.batch_init = jax.vmap( + agent2.make_initial_state, (0, None), 0 + ) + agent2.batch_policy = jax.jit(jax.vmap(agent2._policy)) + agent2.batch_reset = jax.jit( + jax.vmap(agent2.reset_memory, (0, None), 0), static_argnums=1 + ) + agent2.batch_update = jax.jit(jax.vmap(agent2.update, (1, 0, 0, 0), 0)) + + if args.agent1 != "NaiveEx": + # NaiveEx requires env first step to init. + init_hidden = jnp.tile(agent1._mem.hidden, (args.num_opps, 1, 1)) + agent1._state, agent1._mem = agent1.batch_init( + agent1._state.random_key, init_hidden + ) + + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + agent2._state, agent2._mem = agent2.batch_init( + jax.random.split(agent2._state.random_key, args.num_opps), + init_hidden, + ) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, 0, :] + # a1_rng = rngs[:, :, 1, :] + # a2_rng = rngs[:, :, 2, :] + rngs = rngs[:, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _outer_rollout_fixed(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + _, _, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + self.rollout = jax.jit(_outer_rollout) + self.rollout_fixed = jax.jit(_outer_rollout_fixed) + + def run_loop(self, env, env_params, agents, num_episodes, watchers): + """Run evaluation of agents in environment""" + print("Training") + print("-----------------------") + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + a1_state, a1_mem = agent1._state, agent1._mem + a2_state, a2_mem = agent2._state, agent2._mem + + if watchers: + wandb.restore( + name=self.model_path, run_path=self.run_path, root=os.getcwd() + ) + pretrained_params = load(self.model_path) + a1_state = a1_state._replace(params=pretrained_params) + + num_iters = max( + int(num_episodes / (self.args.num_envs * self.args.num_opps)), 1 + ) + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) 
+ print(f"Log Interval {log_interval}") + + # RNG are the same for num_opps but different for num_envs + rngs = jnp.concatenate( + [jax.random.split(rng, self.args.num_envs)] * self.args.num_opps + ).reshape((self.args.num_opps, self.args.num_envs, -1)) + # run actual loop + print('num episodes', num_episodes) + for i in range(num_episodes): + + obs, env_state = env.reset(rngs, env_params) + rewards = [ + jnp.zeros((self.args.num_opps, self.args.num_envs)), + jnp.zeros((self.args.num_opps, self.args.num_envs)), + ] + + if self.args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + elif self.args.env_type in ["meta"]: + # meta-experiments - init 2nd agent per trial + a2_state, a2_mem = agent2.batch_init( + jax.random.split(rng, self.num_opps), a2_mem.hidden + ) + # run trials + + vals, stack = jax.lax.scan( + self.rollout, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), + None, + length=self.args.stop, + ) + traj_1, traj_2, a2_metrics = stack + ( + rngs, + _, + _, + _, + _, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + vals, stack = jax.lax.scan( + self.rollout_fixed, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), + None, + length=(self.args.num_steps // self.args.num_inner_steps)-self.args.stop, + ) + + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + traj_1_fixed, traj_2_fixed, a2_metrics_fixed = stack + + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + + # logging + traj_1_rewards = jnp.concatenate([traj_1.rewards, traj_1_fixed.rewards], axis=0) + traj_2_rewards = jnp.concatenate([traj_2.rewards, traj_2_fixed.rewards], axis=0) + traj_1_rewards = traj_1_rewards.mean(axis=(1,3)) + traj_2_rewards = traj_2_rewards.mean(axis=(1,3)) + for i in range(len(traj_1_rewards)): + wandb.log({"r1": traj_1_rewards[i].item()}, step=i) + wandb.log({"r2": traj_2_rewards[i].item()}, step=i) + + self.train_episodes += 1 + if i % log_interval == 0: + print(f"Episode {i}") + if self.args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.cg_stats(env_state), + ) + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif self.args.env_type in [ + "meta", + "sequential", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + else: + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + env_stats = {} + + print(f"Env Stats: {env_stats}") + print( + f"Total Episode Reward: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print() + + if watchers: + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + agent2._logger.metrics = ( + agent2._logger.metrics | flattened_metrics + ) + + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb.log( + { + "episodes": self.train_episodes, + "train/episode_reward/player_1": float( + rewards_1.mean() + ), + "train/episode_reward/player_2": float( + rewards_2.mean() + ), + } + | env_stats, + ) + + agents[0]._state = a1_state + agents[1]._state = a2_state + return agents diff --git a/pax/runners/runner_evo.py b/pax/runners/runner_evo.py 
index 9ce590b0..43c4a0bd 100644 --- a/pax/runners/runner_evo.py +++ b/pax/runners/runner_evo.py @@ -337,6 +337,12 @@ def _rollout( a2_rng, agent2._mem.hidden, ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() # run trials vals, stack = jax.lax.scan( diff --git a/pax/runners/runner_evo_hardstop.py b/pax/runners/runner_evo_hardstop.py new file mode 100644 index 00000000..cb5345fd --- /dev/null +++ b/pax/runners/runner_evo_hardstop.py @@ -0,0 +1,648 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoHardstopRunner: + """ + Evolutionary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoHardstopRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against a Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + This runner additionally freezes the opponent's learning after a randomly sampled + number of meta-episodes (the hardstop setting). + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...)
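As the comments above note, the evo runners batch over three leading axes (num_envs, num_opps, popsize) by stacking `jax.vmap` calls that map over the RNG while broadcasting the env params, with an extra `jax.pmap` over devices applied later to the whole rollout. A minimal sketch of the nested-vmap pattern, using a toy step function rather than the gymnax env:

```python
# Sketch only: `toy_step` stands in for env.step / env.reset.
import jax
import jax.numpy as jnp


def toy_step(rng, params):
    # one "environment transition": RNG is batched, params are shared
    return jax.random.uniform(rng) * params


batched_step = toy_step
for _ in range(3):  # wrap for num_envs, num_opps, popsize
    batched_step = jax.vmap(batched_step, in_axes=(0, None))
batched_step = jax.jit(batched_step)

popsize, num_opps, num_envs = 4, 2, 8
rngs = jax.random.split(jax.random.PRNGKey(0), popsize * num_opps * num_envs)
rngs = rngs.reshape(popsize, num_opps, num_envs, -1)
print(batched_step(rngs, jnp.float32(2.0)).shape)  # (4, 2, 8)
```

Each wrap adds one leading axis, which is why the runner reshapes its split RNGs to `(popsize, num_opps, num_envs, -1)` before calling the env.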
+ # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + counter, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + # jax.debug.print("Step Size: {x}", x=a2_state.opt_state[2].hyperparams['step_size'][0]) + # jax.debug.print("Counter: {x}", x=counter[0]) + # update second agent + a2_state.opt_state[2].hyperparams['step_size'] = jnp.where(counter <= 0, 0.0, a2_state.opt_state[2].hyperparams['step_size']) + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter - 1, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, 
a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + random_numbers = jax.random.uniform(_rng_run, minval=1, maxval=self.num_outer_steps, shape=(10,)) + # repeat the array 1000 times along the first dimension + counter = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + counter, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + counter, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + 
init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in 
enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/runner_evo_scanned.py b/pax/runners/runner_evo_scanned.py new file mode 100644 index 00000000..e22b73bb --- /dev/null +++ b/pax/runners/runner_evo_scanned.py @@ -0,0 +1,528 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoScannedRunner: + """ + Evolutionary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoScannedRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against a Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + This runner also scans over the evolutionary steps, which leads to longer compilation time + and shorter run time, but makes logging during training impossible. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # 
a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + def es_step(state_input, tmp): + rng, rng_run, rng_evo, rng_key = jax.random.split(state_input[0], 4) + x, evo_state = strategy.ask(rng_evo, state_input[1], es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = 
jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + fitness_re = fit_shaper.apply(x, fitness) + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + return (rng, evo_state), (fitness, other_fitness, env_stats, rewards_1, rewards_2, a2_metrics) + + state_input = (rng, evo_state) + _, scan_output = jax.lax.scan(es_step, state_input, None, length=num_gens) + + return agents diff --git a/pax/runners/runner_stevie.py b/pax/runners/runner_stevie.py new file mode 100644 index 00000000..9b8f333e --- /dev/null +++ b/pax/runners/runner_stevie.py @@ -0,0 +1,413 @@ +import os +import time +from typing import NamedTuple + +import jax +import jax.numpy as jnp + +import wandb +from pax.utils import load +from pax.watchers import cg_visitation, ipd_visitation + +MAX_WANDB_CALLS = 10000 +NUM_ENVS = 10 + + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class StevieRunner: + """ + Runner in which the principal agent is blinded in a subset of the parallel environments: + in those environments its observation is replaced by a fixed placeholder state, so it + cannot see what is happening. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is important for + logic used in the class. + env (gymnax.envs.Environment): + The environment that the agents will run in. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__(self, agents, env, args): + self.train_episodes = 0 + self.start_time = time.time() + self.args = args + self.num_opps = args.num_opps + self.random_key = jax.random.PRNGKey(args.seed) + self.run_path = args.run_path + self.model_path = args.model_path + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(cg_visitation) + # VMAP for num envs: we vmap over the rng but not params + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # VMAP for num opps: we vmap over the rng but not params + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + + self.split = jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)) + + agent1, agent2 = agents + + if args.agent1 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent1.batch_init = jax.jit(jax.vmap(agent1.make_initial_state)) + else: + # batch MemoryState not TrainingState + agent1.batch_init = jax.vmap( + agent1.make_initial_state, + (None, 0), + (None, 0), + ) + agent1.batch_reset = jax.jit( + jax.vmap(agent1.reset_memory, (0, None), 0), static_argnums=1 + ) + + agent1.batch_policy = jax.jit( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)) + ) + + # batch all for Agent2 + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit(jax.vmap(agent2.make_initial_state)) + else: + agent2.batch_init = jax.vmap( + agent2.make_initial_state, (0, None), 0 + ) + agent2.batch_policy = jax.jit(jax.vmap(agent2._policy)) + agent2.batch_reset = jax.jit( + jax.vmap(agent2.reset_memory, (0, None), 0), static_argnums=1 + ) + agent2.batch_update = jax.jit(jax.vmap(agent2.update, (1, 0, 0, 0), 0)) + + if args.agent1 != "NaiveEx": + # NaiveEx requires env first step to init. + init_hidden = jnp.tile(agent1._mem.hidden, (args.num_opps, 1, 1)) + agent1._state, agent1._mem = agent1.batch_init( + agent1._state.random_key, init_hidden + ) + + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + agent2._state, agent2._mem = agent2.batch_init( + jax.random.split(agent2._state.random_key, args.num_opps), + init_hidden, + ) + + + # BLIND_IDX = [] #For Timon to crank up that conspiracy + # BLIND_MASK = jnp.array( + # [[[0,0,0,0,0] if idx in BLIND_IDX else [1,1,1,1,1] for idx in range(args.num_envs)]], + # dtype=jnp.int8) + + # NOT_BLIND_MASK = jnp.logical_not(BLIND_MASK) + BLIND_STATE = jnp.tile(jnp.array([0, 0, 0, 0, 1]), (args.num_opps, args.num_envs, 1)) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, 0, :] + # a1_rng = rngs[:, :, 1, :] + + # a2_rng = rngs[:, :, 2, :] + rngs = rngs[:, :, 3, :] + obs1 = BLIND_MASK * obs1 + NOT_BLIND_MASK*BLIND_STATE + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), (*trajectories, a2_metrics) + + self.rollout = jax.jit(_outer_rollout) + + def run_loop(self, env, env_params, agents, num_episodes, watchers): + """Run evaluation of agents in environment""" + print("Eval") + print("-----------------------") + for s in range(self.args.num_envs): + print(f"Number of blind dims: {s}") + BLIND_IDX = jnp.arange(s) #For Timon to crank up that conspiracy + BLIND_MASK = jnp.array( + [[[0,0,0,0,0] if idx in BLIND_IDX else [1,1,1,1,1] for idx in range(self.args.num_envs)]], + dtype=jnp.int8) + + NOT_BLIND_MASK = jnp.logical_not(BLIND_MASK) + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + a1_state, a1_mem = agent1._state, agent1._mem + a2_state, a2_mem = agent2._state, agent2._mem + + if watchers: + wandb.restore( + name=self.model_path, run_path=self.run_path, root=os.getcwd() + ) + pretrained_params = load(self.model_path) + a1_state = a1_state._replace(params=pretrained_params) + + num_iters = max( + int(num_episodes / (self.args.num_envs * 
self.args.num_opps)), 1 + ) + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Log Interval {log_interval}") + + # RNG are the same for num_opps but different for num_envs + rngs = jnp.concatenate( + [jax.random.split(rng, self.args.num_envs)] * self.args.num_opps + ).reshape((self.args.num_opps, self.args.num_envs, -1)) + # run actual loop + for i in range(num_episodes): + obs, env_state = env.reset(rngs, env_params) + rewards = [ + jnp.zeros((self.args.num_opps, self.args.num_envs)), + jnp.zeros((self.args.num_opps, self.args.num_envs)), + ] + + if self.args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + elif self.args.env_type in ["meta"]: + # meta-experiments - init 2nd agent per trial + a2_state, a2_mem = agent2.batch_init( + jax.random.split(rng, self.num_opps), a2_mem.hidden + ) + # run trials + vals, stack = jax.lax.scan( + self.rollout, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), + None, + length=self.args.num_steps // self.args.num_inner_steps, + ) + + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + # jax.debug.breakpoint() + # logging + self.train_episodes += 1 + if i % log_interval == 0: + print(f"Episode {i}") + if self.args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.cg_stats(env_state), + ) + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif self.args.env_type in [ + "meta", + "sequential", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + else: + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + env_stats = {} + + print(f"Env Stats: {env_stats}") + print( + f"Total Episode Reward: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print() + + if watchers: + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + agent2._logger.metrics = ( + agent2._logger.metrics | flattened_metrics + ) + + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb.log( + { + "episodes": s, + "train/episode_reward/player_1": float( + rewards_1.mean() + ), + "train/episode_reward/player_2": float( + rewards_2.mean() + ), + } + | env_stats, + ) + + agents[0]._state = a1_state + agents[1]._state = a2_state + return agents diff --git a/stevie_bash.sh b/stevie_bash.sh new file mode 100755 index 00000000..1f1cfa89 --- /dev/null +++ b/stevie_bash.sh @@ -0,0 +1,70 @@ +#!/bin/bash +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m 
+experiment/ipd/stevie/mfos_avg/ten=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_4 ++wandb.log=True + + +###### MFOS NOTHING ###### +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_4 ++wandb.log=True + +###### SHAPER AVG ###### +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_2 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_2 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_2 ++wandb.log=True + +###### SHAPER NOTHING ###### +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_1 ++wandb.log=True +python 
-m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_4 ++wandb.log=True \ No newline at end of file diff --git a/test/runners/test_runners.py b/test/runners/test_runners.py index c74dc3b1..11e96bad 100644 --- a/test/runners/test_runners.py +++ b/test/runners/test_runners.py @@ -73,3 +73,35 @@ def test_runner_marl_nplayer(): _test_runner( ["+experiment/multiplayer_ipd=lola_vs_ppo_ipd", "++num_inner_steps=10"] ) + + +def test_runner_evo_hardstop(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_hardstop"]) + + +def test_runner_evo_mixed_lr(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_lr"]) + + +def test_runner_evo_mixed_payoff(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff"]) + + +def test_runner_evo_mixed_ipd_payoff(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_ipd_payoff"]) + + +def test_runner_evo_mixed_payoff_gen(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_gen"]) + + +def test_runner_evo_mixed_payoff_input(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_input"]) + + +def test_runner_evo_scanned(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_scanned"]) + + +def test_runner_evo_mixed_payoff_only_opp(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_only_opp"]) \ No newline at end of file
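The `evo_hardstop` runner exercised by `test_runner_evo_hardstop` above freezes the opponent mid-training: a countdown is sampled at the start of each rollout and, once it reaches zero, the opponent's optimizer step size is forced to zero inside `_outer_rollout` via `jnp.where`, so its subsequent updates become no-ops. A minimal sketch of that mechanism with illustrative shapes and names (not the actual PAX optimizer state):

```python
# Sketch only: shapes and variable names are illustrative.
import jax
import jax.numpy as jnp

rng = jax.random.PRNGKey(0)
num_outer_steps, num_opps = 8, 4

# one random stop point per opponent, somewhere inside the meta-episode budget
counter = jax.random.randint(rng, (num_opps,), minval=1, maxval=num_outer_steps)
step_size = jnp.full((num_opps,), 1e-2)

for outer_step in range(num_outer_steps):
    # zero the learning rate for any opponent whose countdown has expired
    effective_lr = jnp.where(counter <= 0, 0.0, step_size)
    # ... the opponent update would use `effective_lr` here ...
    counter = counter - 1
    print(outer_step, effective_lr)
```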