EdanToledo · EdanToledo · May 7, 2024 · May 4, 2024 · May 5, 2024 · May 6, 2024
diff --git a/README.md b/README.md
@@ -63,6 +63,7 @@ Stoix currently offers the following building blocks for Single-Agent RL researc
 - **Proximal Policy Optimization (PPO)** - [Paper](https://arxiv.org/abs/1707.06347)
 - **Discovered Policy Optimization (DPO)** [Paper](https://arxiv.org/abs/2210.05639)
 - **Maximum a Posteriori Policy Optimisation (MPO)** - [Paper](https://arxiv.org/abs/1806.06920)
+- **On-Policy Maximum a Posteriori Policy Optimisation (V-MPO)** - [Paper](https://arxiv.org/abs/1909.12238)
 - **Advantage-Weighted Regression (AWR)** - [Paper](https://arxiv.org/abs/1910.00177)
 - **AlphaZero** - [Paper](https://arxiv.org/abs/1712.01815)
 - **MuZero** - [Paper](https://arxiv.org/abs/1911.08265)

diff --git a/stoix/configs/default_ff_vmpo.yaml b/stoix/configs/default_ff_vmpo.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - logger: base_logger
+  - arch: anakin
+  - system: ff_vmpo
+  - network: mlp
+  - env: gymnax/cartpole
+  - _self_
diff --git a/stoix/configs/default_ff_vmpo_continuous.yaml b/stoix/configs/default_ff_vmpo_continuous.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - logger: base_logger
+  - arch: anakin
+  - system: ff_vmpo_continuous
+  - network: mlp_continuous
+  - env: brax/ant
+  - _self_
diff --git a/stoix/configs/system/ff_vmpo.yaml b/stoix/configs/system/ff_vmpo.yaml
@@ -0,0 +1,22 @@
+# --- Defaults FF-VMPO Discrete ---
+
+system_name: ff_vmpo # Name of the system.
+
+# --- RL hyperparameters ---
+rollout_length: 32 # Number of environment steps per vectorised environment.
+epochs: 16 # Number of sgd steps per rollout.
+actor_lr: 3e-4  # the learning rate of the policy network optimizer
+critic_lr: 3e-4  # the learning rate of the critic network optimizer
+dual_lr: 1e-2  # the learning rate of the optimizer for the dual variables (temperature and alpha)
+actor_target_period: 50 # number of online network updates before updating the target network
+gamma: 0.99  # discount factor
+max_grad_norm: 1.0 # Maximum norm of the gradients for a weight update.
+decay_learning_rates: False # Whether learning rates should be linearly decayed during training.
+max_abs_reward : 20_000  # maximum absolute reward value
+epsilon: 0.5 # KL constraint on the non-parametric auxiliary policy, the one associated with the dual variable called temperature.
+epsilon_policy : 0.001 #  KL constraint on the categorical policy, the one associated with the dual variable called alpha.
+init_log_temperature: 3. # initial value for the temperature in log-space, note a softplus (rather than an exp) will be used to transform this.
+init_log_alpha: 3. # initial value for the alpha value in log-space, note a softplus (rather than an exp) will be used to transform this.
+use_n_step_bootstrap : False # whether to use n-step bootstrapping for the value function targets.
+n_step_for_sequence_bootstrap : 5 # the number of steps to use for the sequence bootstrap. This is only used if use_n_step_bootstrap is True.
+gae_lambda : 0.95 # the GAE lambda parameter. This is only used if use_n_step_bootstrap is False.
diff --git a/stoix/configs/system/ff_vmpo_continuous.yaml b/stoix/configs/system/ff_vmpo_continuous.yaml
@@ -0,0 +1,25 @@
+# --- Defaults FF-VMPO Continuous ---
+
+system_name: ff_vmpo # Name of the system.
+
+# --- RL hyperparameters ---
+rollout_length: 32 # Number of environment steps per vectorised environment.
+epochs: 16 # Number of sgd steps per rollout.
+actor_lr: 3e-4  # the learning rate of the policy network optimizer
+critic_lr: 3e-4  # the learning rate of the critic network optimizer
+dual_lr: 1e-2  # the learning rate of the optimizer for the dual variables (temperature, alpha_mean and alpha_stddev)
+actor_target_period: 50 # number of online network updates before updating the target network
+gamma: 0.99  # discount factor
+max_grad_norm: 0.5 # Maximum norm of the gradients for a weight update.
+decay_learning_rates: False # Whether learning rates should be linearly decayed during training.
+max_abs_reward : 20_000  # maximum absolute reward value
+epsilon: 0.05 # KL constraint on the non-parametric auxiliary policy, the one associated with the dual variable called temperature.
+epsilon_mean: 0.05 # KL constraint on the mean of the Gaussian policy, the one associated with the dual variable called alpha_mean.
+epsilon_stddev: 0.0005 # KL constraint on the stddev of the Gaussian policy, the one associated with the dual variable called alpha_stddev.
+init_log_temperature: 10. # initial value for the temperature in log-space, note a softplus (rather than an exp) will be used to transform this.
+init_log_alpha_mean: 10 # initial value for the alpha_mean in log-space, note a softplus (rather than an exp) will be used to transform this.
+init_log_alpha_stddev: 500 # initial value for the alpha_stddev in log-space, note a softplus (rather than an exp) will be used to transform this.
+per_dim_constraining: True # whether to enforce the KL constraint on each dimension independently; this is the default. Otherwise the overall KL is constrained, which allows some dimensions to change more at the expense of others staying put.
+use_n_step_bootstrap : False # whether to use n-step bootstrapping for the value function targets.
+n_step_for_sequence_bootstrap : 10 # the number of steps to use for the sequence bootstrap. This is only used if use_n_step_bootstrap is True.
+gae_lambda : 0.95 # the GAE lambda parameter. This is only used if use_n_step_bootstrap is False.
diff --git a/stoix/systems/mpo/ff_mpo.py b/stoix/systems/mpo/ff_mpo.py
@@ -214,18 +214,18 @@ def _q_loss_fn(
                 target_q_params: FrozenDict,
                 online_actor_params: FrozenDict,
                 target_actor_params: FrozenDict,
-                sequences: SequenceStep,
+                sequence: SequenceStep,
                 rng_key: chex.PRNGKey,
             ) -> jnp.ndarray:
 
                 online_actor_policy = actor_apply_fn(
-                    online_actor_params, sequences.obs
+                    online_actor_params, sequence.obs
                 )  # [B, T, ...]
                 target_actor_policy = actor_apply_fn(
-                    target_actor_params, sequences.obs
+                    target_actor_params, sequence.obs
                 )  # [B, T, ...]
-                a_t = jax.nn.one_hot(sequences.action, config.system.action_dim)  # [B, T, ...]
-                online_q_t = q_apply_fn(online_q_params, sequences.obs, a_t)  # [B, T]
+                a_t = jax.nn.one_hot(sequence.action, config.system.action_dim)  # [B, T, ...]
+                online_q_t = q_apply_fn(online_q_params, sequence.obs, a_t)  # [B, T]
 
                 # Cast and clip rewards.
                 discount = 1.0 - sequence.done.astype(jnp.float32)
@@ -254,7 +254,7 @@ def _q_loss_fn(
 
                 # Compute the Q-values for the next state-action pairs; [N, B, T].
                 q_values = jax.vmap(q_apply_fn, in_axes=(None, None, 0))(
-                    target_q_params, sequences.obs, a_evaluation
+                    target_q_params, sequence.obs, a_evaluation
                 )
 
                 # When policy_eval_stochastic == True, this corresponds to expected SARSA.
@@ -263,10 +263,10 @@ def _q_loss_fn(
 
                 if config.system.use_retrace:
                     # Compute the log-rhos for the retrace targets.
-                    log_rhos = target_actor_policy.log_prob(sequences.action) - sequences.log_prob
+                    log_rhos = target_actor_policy.log_prob(sequence.action) - sequence.log_prob
 
                     # Compute target Q-values
-                    target_q_t = q_apply_fn(target_q_params, sequences.obs, a_t)  # [B, T]
+                    target_q_t = q_apply_fn(target_q_params, sequence.obs, a_t)  # [B, T]
 
                     # Compute retrace targets.
                     # These targets use the rewards and discounts as in normal TD-learning but