Merge pull request #68 from EdanToledo/EdanToledo/issue66
chore: Make Update Batch Size not affect num envs, buffer size and batch size
EdanToledo authored May 1, 2024
2 parents 1a932d9 + eb3a3a0 commit 49ded3e
Showing 49 changed files with 335 additions and 278 deletions.
3 changes: 2 additions & 1 deletion stoix/configs/arch/anakin.yaml
@@ -2,7 +2,8 @@

# --- Training ---
seed: 42 # RNG seed.
-total_num_envs: 1024 # Total Number of vectorised environments. Needs to be divisible by number of devices.
+update_batch_size: 1 # Number of vectorised gradient updates per device.
+total_num_envs: 1024 # Total number of vectorised environments across all devices and update batches. Needs to be divisible by n_devices*update_batch_size.
total_timesteps: 1e7 # Set the total environment steps.
# If unspecified, it's derived from num_updates; otherwise, num_updates adjusts based on this value.
num_updates: ~ # Number of updates
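For context, the arithmetic the new comment describes can be sketched as follows; this is an illustrative helper with a made-up name, not Stoix's actual code.

import jax

# Illustrative sketch of the divisibility rule in the comment above (hypothetical helper, not Stoix code).
def num_envs_per_update_batch(total_num_envs: int, update_batch_size: int) -> int:
    n_devices = jax.device_count()
    divisor = n_devices * update_batch_size
    assert total_num_envs % divisor == 0, (
        f"total_num_envs ({total_num_envs}) must be divisible by "
        f"n_devices*update_batch_size ({divisor})."
    )
    # Each of the update_batch_size vectorised update batches on each device steps this many envs.
    return total_num_envs // divisor

# With the defaults above on a single device: num_envs_per_update_batch(1024, 1) == 1024.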
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_awr.yaml
@@ -3,13 +3,12 @@
system_name: ff_awr # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 2 # Number of environment steps per vectorised environment.
num_actor_steps: 100 # Number of sgd steps for the actor per rollout.
num_critic_steps: 20 # Number of sgd steps for the critic per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 16 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 5e-5 # the learning rate of the policy network optimizer
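Likewise, a minimal sketch (assumed helper name, not Stoix's API) of how total_buffer_size and total_batch_size are split under the updated comments:

import jax

# Illustrative only: per-device, per-update-batch sizes implied by the comments above.
def split_buffer_and_batch(total_buffer_size: int, total_batch_size: int,
                           update_batch_size: int) -> tuple[int, int]:
    n_devices = jax.device_count()
    divisor = n_devices * update_batch_size
    for name, total in (("total_buffer_size", total_buffer_size),
                        ("total_batch_size", total_batch_size)):
        assert total % divisor == 0, f"{name} ({total}) must be divisible by {divisor}."
    buffer_size = total_buffer_size // divisor  # replay buffer capacity per update batch on each device
    batch_size = total_batch_size // divisor    # sampled batch size per update batch on each device
    return buffer_size, batch_size

# ff_awr defaults on one device with update_batch_size=1: split_buffer_and_batch(50_000, 32, 1) == (50_000, 32).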
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_az.yaml
@@ -7,12 +7,11 @@ system_name: ff_az # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 8 # Number of epochs per training data batch.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 16 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
gamma: 0.99 # Discounting factor.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_c51.yaml
@@ -3,12 +3,11 @@
system_name: ff_c51 # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_d4pg.yaml
@@ -3,12 +3,11 @@
system_name: ff_d4pg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
actor_lr: 3e-4 # the learning rate of the policy network optimizer
q_lr: 3e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_ddpg.yaml
@@ -3,12 +3,11 @@
system_name: ff_ddpg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
actor_lr: 3e-4 # the learning rate of the policy network optimizer
q_lr: 3e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
1 change: 0 additions & 1 deletion stoix/configs/system/ff_dpo.yaml
@@ -5,7 +5,6 @@ system_name: ff_dpo # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 4 # Number of ppo epochs per training data batch.
num_minibatches: 16 # Number of minibatches per ppo epoch.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_dqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_dqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_dqn_reg.yaml
@@ -3,12 +3,11 @@
system_name: ff_dqn_reg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-5 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mdqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_mdqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mpo.yaml
@@ -3,12 +3,11 @@
system_name: ff_mpo # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 16 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 8 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 1e-4 # the learning rate of the policy network optimizer
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mpo_continuous.yaml
@@ -3,12 +3,11 @@
system_name: ff_mpo # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 32 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 8 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 1e-4 # the learning rate of the policy network optimizer
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mz.yaml
@@ -8,12 +8,11 @@ system_name: ff_mz # Name of the system.

# --- RL hyperparameters ---
lr: 3e-4 # Learning rate for entire algorithm.
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 8 # Number of epochs per training data batch.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 6 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
gamma: 0.99 # Discounting factor.
1 change: 0 additions & 1 deletion stoix/configs/system/ff_ppo.yaml
@@ -5,7 +5,6 @@ system_name: ff_ppo # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 4 # Number of ppo epochs per training data batch.
num_minibatches: 16 # Number of minibatches per ppo epoch.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_qr_dqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_qr_dqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 5e-5 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
1 change: 0 additions & 1 deletion stoix/configs/system/ff_reinforce.yaml
@@ -5,7 +5,6 @@ system_name: ff_reinforce # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 32 # Number of environment steps per vectorised environment.
gamma: 0.99 # Discounting factor.
ent_coef: 0.001 # Entropy regularisation term for loss function.
