Merge pull request #68 from EdanToledo/EdanToledo/issue66
chore: Make Update Batch Size not affect num envs, buffer size and batch size
EdanToledo authored May 1, 2024
2 parents 1a932d9 + eb3a3a0 commit 49ded3e
Showing 49 changed files with 335 additions and 278 deletions.
3 changes: 2 additions & 1 deletion stoix/configs/arch/anakin.yaml
@@ -2,7 +2,8 @@

# --- Training ---
seed: 42 # RNG seed.
-total_num_envs: 1024 # Total Number of vectorised environments. Needs to be divisible by number of devices.
+update_batch_size: 1 # Number of vectorised gradient updates per device.
+total_num_envs: 1024 # Total number of vectorised environments across all devices and update batches. Needs to be divisible by n_devices*update_batch_size.
total_timesteps: 1e7 # Set the total environment steps.
# If unspecified, it's derived from num_updates; otherwise, num_updates adjusts based on this value.
num_updates: ~ # Number of updates
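For context, the arithmetic the new comment describes can be sketched as follows; this is an illustrative helper with a made-up name, not Stoix's actual code.

import jax

# Illustrative sketch of the divisibility rule in the comment above (hypothetical helper, not Stoix code).
def num_envs_per_update_batch(total_num_envs: int, update_batch_size: int) -> int:
    n_devices = jax.device_count()
    divisor = n_devices * update_batch_size
    assert total_num_envs % divisor == 0, (
        f"total_num_envs ({total_num_envs}) must be divisible by "
        f"n_devices*update_batch_size ({divisor})."
    )
    # Each of the update_batch_size vectorised update batches on each device steps this many envs.
    return total_num_envs // divisor

# With the defaults above on a single device: num_envs_per_update_batch(1024, 1) == 1024.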
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_awr.yaml
@@ -3,13 +3,12 @@
system_name: ff_awr # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 2 # Number of environment steps per vectorised environment.
num_actor_steps: 100 # Number of sgd steps for the actor per rollout.
num_critic_steps: 20 # Number of sgd steps for the critic per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 16 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 5e-5 # the learning rate of the policy network optimizer
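Likewise, a minimal sketch (assumed helper name, not Stoix's API) of how total_buffer_size and total_batch_size are split under the updated comments:

import jax

# Illustrative only: per-device, per-update-batch sizes implied by the comments above.
def split_buffer_and_batch(total_buffer_size: int, total_batch_size: int,
                           update_batch_size: int) -> tuple[int, int]:
    n_devices = jax.device_count()
    divisor = n_devices * update_batch_size
    for name, total in (("total_buffer_size", total_buffer_size),
                        ("total_batch_size", total_batch_size)):
        assert total % divisor == 0, f"{name} ({total}) must be divisible by {divisor}."
    buffer_size = total_buffer_size // divisor  # replay buffer capacity per update batch on each device
    batch_size = total_batch_size // divisor    # sampled batch size per update batch on each device
    return buffer_size, batch_size

# ff_awr defaults on one device with update_batch_size=1: split_buffer_and_batch(50_000, 32, 1) == (50_000, 32).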
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_az.yaml
@@ -7,12 +7,11 @@ system_name: ff_az # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 8 # Number of epochs per training data batch.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 16 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
gamma: 0.99 # Discounting factor.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_c51.yaml
@@ -3,12 +3,11 @@
system_name: ff_c51 # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_d4pg.yaml
@@ -3,12 +3,11 @@
system_name: ff_d4pg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
actor_lr: 3e-4 # the learning rate of the policy network optimizer
q_lr: 3e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_ddpg.yaml
@@ -3,12 +3,11 @@
system_name: ff_ddpg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
actor_lr: 3e-4 # the learning rate of the policy network optimizer
q_lr: 3e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
1 change: 0 additions & 1 deletion stoix/configs/system/ff_dpo.yaml
@@ -5,7 +5,6 @@ system_name: ff_dpo # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 4 # Number of ppo epochs per training data batch.
num_minibatches: 16 # Number of minibatches per ppo epoch.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_dqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_dqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_dqn_reg.yaml
@@ -3,12 +3,11 @@
system_name: ff_dqn_reg # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-5 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mdqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_mdqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 1e-4 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mpo.yaml
@@ -3,12 +3,11 @@
system_name: ff_mpo # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 16 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 50_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 8 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 1e-4 # the learning rate of the policy network optimizer
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mpo_continuous.yaml
@@ -3,12 +3,11 @@
system_name: ff_mpo # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 32 # Number of sgd steps per rollout.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 8 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
actor_lr: 1e-4 # the learning rate of the policy network optimizer
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_mz.yaml
@@ -8,12 +8,11 @@ system_name: ff_mz # Name of the system.

# --- RL hyperparameters ---
lr: 3e-4 # Learning rate for entire algorithm.
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 8 # Number of environment steps per vectorised environment.
epochs: 8 # Number of epochs per training data batch.
warmup_steps: 16 # Number of steps to collect before training.
-total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 32 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 25_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 32 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
sample_sequence_length: 6 # Number of steps to consider for each element of the batch.
period : 1 # Period of the sampled sequences.
gamma: 0.99 # Discounting factor.
1 change: 0 additions & 1 deletion stoix/configs/system/ff_ppo.yaml
@@ -5,7 +5,6 @@ system_name: ff_ppo # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 16 # Number of environment steps per vectorised environment.
epochs: 4 # Number of ppo epochs per training data batch.
num_minibatches: 16 # Number of minibatches per ppo epoch.
5 changes: 2 additions & 3 deletions stoix/configs/system/ff_qr_dqn.yaml
@@ -3,12 +3,11 @@
system_name: ff_qr_dqn # Name of the system.

# --- RL hyperparameters ---
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 1 # Number of environment steps per vectorised environment.
epochs: 1 # Number of sgd steps per rollout.
warmup_steps: 32 # Number of steps to collect before training.
-total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices. This means each device has a buffer of size buffer_size/num_devices. This must be divisible by num_devices.
-total_batch_size: 256 # Total effective number of samples to train on. This means each device has a batch size of batch_size/num_devices. This must be divisible by num_devices.
+total_buffer_size: 500_000 # Total effective size of the replay buffer across all devices and vectorised update steps. Each device has a buffer of size buffer_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
+total_batch_size: 256 # Total effective number of samples to train on. Each device has a batch size of batch_size//num_devices, which is further divided by update_batch_size. This value must be divisible by num_devices*update_batch_size.
q_lr: 5e-5 # the learning rate of the Q network optimizer
tau: 0.005 # smoothing coefficient for target networks
gamma: 0.99 # discount factor
1 change: 0 additions & 1 deletion stoix/configs/system/ff_reinforce.yaml
@@ -5,7 +5,6 @@ system_name: ff_reinforce # Name of the system.
# --- RL hyperparameters ---
actor_lr: 3e-4 # Learning rate for actor network
critic_lr: 3e-4 # Learning rate for critic network
-update_batch_size: 1 # Number of vectorised gradient updates per device.
rollout_length: 32 # Number of environment steps per vectorised environment.
gamma: 0.99 # Discounting factor.
ent_coef: 0.001 # Entropy regularisation term for loss function.
