From 97a0d4edaa5a3aa8d0d0871585809c2e230e7a7f Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 15:03:14 +0200 Subject: [PATCH 01/11] test --- rllib/policy/torch_policy_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 4a78b6848d55..6199b55f9af4 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -683,7 +683,7 @@ def load_batch_into_buffer( ) # 3) Load splits into the given buffer (consisting of n GPUs). - slices = [slice.to_device(self.devices[i]) for i, slice in enumerate(slices)] + slices = [slice.to_device(self.devices[buffer_index]) for slice in slices] self._loaded_batches[buffer_index] = slices # Return loaded samples per-device. From cf377f7a2862034935bfc5a11f9498f29d8d127d Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 15:38:11 +0200 Subject: [PATCH 02/11] another test --- rllib/policy/torch_policy_v2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 6199b55f9af4..3817d79a015e 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -683,7 +683,7 @@ def load_batch_into_buffer( ) # 3) Load splits into the given buffer (consisting of n GPUs). - slices = [slice.to_device(self.devices[buffer_index]) for slice in slices] + slices = [slice.to_device(self.devices[i]) for i, slice in enumerate(slices)] self._loaded_batches[buffer_index] = slices # Return loaded samples per-device. @@ -1194,6 +1194,8 @@ def _worker(shard_idx, model, sample_batch, device): raise last_result[0] from last_result[1] # Multi device (GPU) case: Parallelize via threads. else: + [logger.logerr((i , j) for i,j in zip(self.model_gpu_towers, + sample_batches))] threads = [ threading.Thread( target=_worker, args=(shard_idx, model, sample_batch, device) From 7f60cc16f2dce267030d63cabf814645e32fd724 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 17:43:00 +0200 Subject: [PATCH 03/11] next test --- rllib/policy/torch_policy_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 3817d79a015e..dc9ff0fa163f 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1100,6 +1100,10 @@ def _multi_gpu_parallel_grad_calc( results = {} grad_enabled = torch.is_grad_enabled() + logger.warning("MyMarker") + [logger.warning(i, j) for i, j in zip(self.model_gpu_towers, + sample_batches)] + def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) try: @@ -1194,8 +1198,6 @@ def _worker(shard_idx, model, sample_batch, device): raise last_result[0] from last_result[1] # Multi device (GPU) case: Parallelize via threads. 
else: - [logger.logerr((i , j) for i,j in zip(self.model_gpu_towers, - sample_batches))] threads = [ threading.Thread( target=_worker, args=(shard_idx, model, sample_batch, device) From 59aad5818077d8f6eee71939f0b841b0d2d99005 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 17:44:23 +0200 Subject: [PATCH 04/11] test only simpleQ --- .../multi_gpu_learning_tests.yaml | 221 ------------------ 1 file changed, 221 deletions(-) diff --git a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml index bc5d62710f3d..dc66443d42f1 100644 --- a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml @@ -1,204 +1,3 @@ - -a2c-cartpole-v0: - env: CartPole-v0 - run: A2C - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.001 - -appo-cartpole-v0-no-vtrace: - env: CartPole-v0 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - vtrace: false - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -appo-cartpole-v0-vtrace: - env: CartPole-v0 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -ddpg-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: DDPG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 8000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 512 - -dqn-cartpole-v0: - env: CartPole-v0 - run: DQN - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 50000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 64 - # Mimic tuned_example for DQN CartPole. - n_step: 3 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -impala-cartpole-v0: - env: CartPole-v0 - run: IMPALA - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 1000 - -pg-cartpole-v0: - env: CartPole-v0 - run: PG - # Minimum reward and total ts (in given time_total_s) to pass this test. 
- pass_criteria: - episode_reward_mean: 130.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 400 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -ppo-cartpole-v0: - env: CartPole-v0 - run: PPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 300000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 8000 - -sac-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 40.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - repeat_delay: 0 - num_gpus: 2 - num_workers: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - initial_alpha: 0.001 - # Double batch size (2 GPUs). - train_batch_size: 512 - -sac-repeat-after-me-env-continuous: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_gpus: 2 - num_workers: 0 - initial_alpha: 0.001 - # Double batch size (2 GPUs). - train_batch_size: 512 - simpleq-cartpole-v0: env: CartPole-v0 run: SimpleQ @@ -211,23 +10,3 @@ simpleq-cartpole-v0: config: num_gpus: 2 num_workers: 0 - -td3-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: TD3 - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 25000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 200 From a6c790982201d5e4568484de910ef9006fff24b6 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:02:53 +0200 Subject: [PATCH 05/11] raises device data as exception --- rllib/policy/torch_policy_v2.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index dc9ff0fa163f..f73214f325ac 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,6 +751,18 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} + error_str = str( + { + "tensor_devices": [t["obs"].get_device() for t in device_batches], + "model_devices": [next(m.parameters()).device for m in + self.model_gpu_towers], + "multi_gpu_components": zip(self.model_gpu_towers, device_batches, + self.devices), + } + ) + + raise Exception(error_str) + # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1100,10 +1112,6 @@ def _multi_gpu_parallel_grad_calc( results = {} grad_enabled = torch.is_grad_enabled() - logger.warning("MyMarker") - [logger.warning(i, j) for i, j in zip(self.model_gpu_towers, - sample_batches)] - def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) try: From 8cb5b60ad91a666201068e10a677b057cad14534 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:15:50 +0200 Subject: [PATCH 06/11] only log --- rllib/policy/torch_policy_v2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index f73214f325ac..8560d5f14b04 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,17 +751,17 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} - error_str = str( + log_str = str( { "tensor_devices": [t["obs"].get_device() for t in device_batches], "model_devices": [next(m.parameters()).device for m in self.model_gpu_towers], - "multi_gpu_components": zip(self.model_gpu_towers, device_batches, - self.devices), + "multi_gpu_components": list(zip(self.model_gpu_towers, device_batches, + self.devices)), } ) - raise Exception(error_str) + logger.warning("MyMarker" + log_str) # Do the (maybe parallelized) gradient calculation step. tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) From fba6ca9136178e743d1fd16608212473af79e4ba Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:26:45 +0200 Subject: [PATCH 07/11] log in parallel grad calc --- rllib/policy/torch_policy_v2.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 8560d5f14b04..e5d58e50649e 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -756,12 +756,10 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): "tensor_devices": [t["obs"].get_device() for t in device_batches], "model_devices": [next(m.parameters()).device for m in self.model_gpu_towers], - "multi_gpu_components": list(zip(self.model_gpu_towers, device_batches, - self.devices)), } ) - logger.warning("MyMarker" + log_str) + logger.warning("tensors_and_model_devices" + log_str) # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1114,6 +1112,18 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) + + log_str = str( + { + "shard_idx": shard_idx, + "tensor_devices": [sample_batch["obs"].get_device()], + "model_device": next(model.parameters()).device, + "device": device + } + ) + + logger.warning("parallel_calc_tensor_and_model_devices" + log_str) + try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 device From 2ce484f5bf1d1f802ffd5d05aa68dd00b3d02af9 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:50:51 +0200 Subject: [PATCH 08/11] log target model device --- rllib/policy/torch_policy_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index e5d58e50649e..6e3215906c50 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1118,6 +1118,7 @@ def _worker(shard_idx, model, sample_batch, device): "shard_idx": shard_idx, "tensor_devices": [sample_batch["obs"].get_device()], "model_device": next(model.parameters()).device, + "target_model_device": next(self.target_models[model].parameters()).device, "device": device } ) From c4a42589a807517b05ace7e02389b49d8c4b78fd Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:28:47 +0200 Subject: [PATCH 09/11] place samplebatch on correct device --- .../simple_q/simple_q_torch_policy.py | 2 +- rllib/policy/torch_policy_v2.py | 26 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/rllib/algorithms/simple_q/simple_q_torch_policy.py b/rllib/algorithms/simple_q/simple_q_torch_policy.py index 7ceb81d6f5e2..b33e38a185d5 100644 --- a/rllib/algorithms/simple_q/simple_q_torch_policy.py +++ b/rllib/algorithms/simple_q/simple_q_torch_policy.py @@ -174,7 +174,7 @@ def _compute_q_values( ) -> TensorType: _is_training = is_training if is_training is not None else False input_dict = self._lazy_tensor_dict( - SampleBatch(obs=obs_batch, _is_training=_is_training) + SampleBatch(obs=obs_batch, _is_training=_is_training), device=next(model.parameters()).device ) # Make sure, everything is PyTorch tensors. 
model_out, _ = model(input_dict, [], None) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 6e3215906c50..ae6b316dd479 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1112,18 +1112,20 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) - - log_str = str( - { - "shard_idx": shard_idx, - "tensor_devices": [sample_batch["obs"].get_device()], - "model_device": next(model.parameters()).device, - "target_model_device": next(self.target_models[model].parameters()).device, - "device": device - } - ) - - logger.warning("parallel_calc_tensor_and_model_devices" + log_str) + # + # log_str = str( + # { + # "shard_idx": shard_idx, + # "tensor_devices": [sample_batch["obs"].get_device()], + # "model_device": next(model.parameters()).device, + # "target_model_device": next(self.target_models[model].parameters()).device, + # "device": device, + # "action_distribution": self.dist_class, + # + # } + # ) + + # logger.warning("parallel_calc_tensor_and_model_devices" + log_str) try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 From 40fdd54fa01e3538326428d4dd9a7af8b071b451 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:32:54 +0200 Subject: [PATCH 10/11] possible fix --- .../simple_q/simple_q_torch_policy.py | 4 +-- rllib/policy/torch_policy_v2.py | 25 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/rllib/algorithms/simple_q/simple_q_torch_policy.py b/rllib/algorithms/simple_q/simple_q_torch_policy.py index b33e38a185d5..dacc66e21f70 100644 --- a/rllib/algorithms/simple_q/simple_q_torch_policy.py +++ b/rllib/algorithms/simple_q/simple_q_torch_policy.py @@ -173,9 +173,7 @@ def _compute_q_values( self, model: ModelV2, obs_batch: TensorType, is_training=None ) -> TensorType: _is_training = is_training if is_training is not None else False - input_dict = self._lazy_tensor_dict( - SampleBatch(obs=obs_batch, _is_training=_is_training), device=next(model.parameters()).device - ) + input_dict = SampleBatch(obs=obs_batch, _is_training=_is_training) # Make sure, everything is PyTorch tensors. model_out, _ = model(input_dict, [], None) return model_out diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index ae6b316dd479..4a78b6848d55 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,16 +751,6 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} - log_str = str( - { - "tensor_devices": [t["obs"].get_device() for t in device_batches], - "model_devices": [next(m.parameters()).device for m in - self.model_gpu_towers], - } - ) - - logger.warning("tensors_and_model_devices" + log_str) - # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1112,21 +1102,6 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) - # - # log_str = str( - # { - # "shard_idx": shard_idx, - # "tensor_devices": [sample_batch["obs"].get_device()], - # "model_device": next(model.parameters()).device, - # "target_model_device": next(self.target_models[model].parameters()).device, - # "device": device, - # "action_distribution": self.dist_class, - # - # } - # ) - - # logger.warning("parallel_calc_tensor_and_model_devices" + log_str) - try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 device From bc47f1b6d88c7fbf39411541ab79f58e4f1c7d1e Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:34:55 +0200 Subject: [PATCH 11/11] revert release tests folder --- .../multi_gpu_learning_tests.yaml | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml index dc66443d42f1..bc5d62710f3d 100644 --- a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml @@ -1,3 +1,204 @@ + +a2c-cartpole-v0: + env: CartPole-v0 + run: A2C + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + lr: 0.001 + +appo-cartpole-v0-no-vtrace: + env: CartPole-v0 + run: APPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + vtrace: false + num_gpus: 2 + num_workers: 5 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 1000 + +appo-cartpole-v0-vtrace: + env: CartPole-v0 + run: APPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 5 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 1000 + +ddpg-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: DDPG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 8000 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 512 + +dqn-cartpole-v0: + env: CartPole-v0 + run: DQN + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 50000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 64 + # Mimic tuned_example for DQN CartPole. 
+ n_step: 3 + model: + fcnet_hiddens: [64] + fcnet_activation: linear + +impala-cartpole-v0: + env: CartPole-v0 + run: IMPALA + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + # Double batch size (2 GPUs). + train_batch_size: 1000 + +pg-cartpole-v0: + env: CartPole-v0 + run: PG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 130.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + # Double batch size (2 GPUs). + train_batch_size: 400 + model: + fcnet_hiddens: [64] + fcnet_activation: linear + +ppo-cartpole-v0: + env: CartPole-v0 + run: PPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 300000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 8000 + +sac-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: SAC + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 40.0 + timesteps_total: 4500 + stop: + time_total_s: 600 + config: + env_config: + config: + repeat_delay: 0 + num_gpus: 2 + num_workers: 0 + replay_buffer_config: + type: MultiAgentPrioritizedReplayBuffer + initial_alpha: 0.001 + # Double batch size (2 GPUs). + train_batch_size: 512 + +sac-repeat-after-me-env-continuous: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: SAC + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 4500 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + replay_buffer_config: + type: MultiAgentPrioritizedReplayBuffer + num_gpus: 2 + num_workers: 0 + initial_alpha: 0.001 + # Double batch size (2 GPUs). + train_batch_size: 512 + simpleq-cartpole-v0: env: CartPole-v0 run: SimpleQ @@ -10,3 +211,23 @@ simpleq-cartpole-v0: config: num_gpus: 2 num_workers: 0 + +td3-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: TD3 + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 25000 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 200
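
Taken together, the eleven patches above read as a bisection-by-logging session: the instrumentation added in patches 02 through 08 compares the device of each tower's batch tensors against `next(model.parameters()).device`, and the change that survives to the final state is in `simple_q_torch_policy.py`, where `_compute_q_values` stops re-wrapping the observations with `self._lazy_tensor_dict(...)` (which would presumably move them onto the policy's default device) and instead passes the `SampleBatch` through unchanged, so each GPU tower keeps the slice that `load_batch_into_buffer` already placed on its device via `slice.to_device(self.devices[i])` (see the first two patches). The snippet below is a minimal, standalone sketch of that failure mode and of the device check used while debugging; it is not RLlib code, and the helper name `assert_same_device` is made up for illustration.

    # Standalone sketch (not RLlib code): reproduces the multi-GPU device-mismatch
    # failure mode the patches above are chasing, plus the kind of check the debug
    # logging performed. `assert_same_device` is a hypothetical helper.
    import torch
    import torch.nn as nn

    def assert_same_device(model: nn.Module, batch: dict) -> None:
        """Raise if any tensor in `batch` is not on the model's parameter device."""
        model_device = next(model.parameters()).device
        for key, tensor in batch.items():
            if torch.is_tensor(tensor) and tensor.device != model_device:
                raise RuntimeError(
                    f"Batch key '{key}' is on {tensor.device}, "
                    f"but the model tower lives on {model_device}."
                )

    def main() -> None:
        # One "tower" per device; fall back to CPU-only if fewer than 2 GPUs are visible.
        if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
            devices = [torch.device("cuda:0"), torch.device("cuda:1")]
        else:
            devices = [torch.device("cpu"), torch.device("cpu")]

        towers = [nn.Linear(4, 2).to(dev) for dev in devices]

        # Correct pattern (what the last patch restores): each tower consumes a
        # batch slice that already lives on that tower's device.
        for tower, dev in zip(towers, devices):
            batch = {"obs": torch.randn(8, 4, device=dev)}
            assert_same_device(tower, batch)  # passes
            tower(batch["obs"])               # forward pass succeeds

        # Buggy pattern (what the debug logging exposed): re-homing the slice onto
        # the default device (device 0) breaks every tower except the first.
        if devices[0] != devices[1]:
            bad_batch = {"obs": torch.randn(8, 4, device=devices[0])}
            try:
                assert_same_device(towers[1], bad_batch)
            except RuntimeError as e:
                print("Caught expected mismatch:", e)

    if __name__ == "__main__":
        main()

In the real code path the per-tower placement happens once, at load time, in `load_batch_into_buffer`; the regression was re-homing the tensors a second time inside the loss's Q-value computation, which only shows up when `num_gpus > 1` — hence the temporary trimming of `multi_gpu_learning_tests.yaml` to the SimpleQ case in patch 04 and its revert in patch 11.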