From 97a0d4edaa5a3aa8d0d0871585809c2e230e7a7f Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 15:03:14 +0200 Subject: [PATCH 01/11] test --- rllib/policy/torch_policy_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 4a78b6848d55..6199b55f9af4 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -683,7 +683,7 @@ def load_batch_into_buffer( ) # 3) Load splits into the given buffer (consisting of n GPUs). - slices = [slice.to_device(self.devices[i]) for i, slice in enumerate(slices)] + slices = [slice.to_device(self.devices[buffer_index]) for slice in slices] self._loaded_batches[buffer_index] = slices # Return loaded samples per-device. From cf377f7a2862034935bfc5a11f9498f29d8d127d Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 15:38:11 +0200 Subject: [PATCH 02/11] another test --- rllib/policy/torch_policy_v2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 6199b55f9af4..3817d79a015e 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -683,7 +683,7 @@ def load_batch_into_buffer( ) # 3) Load splits into the given buffer (consisting of n GPUs). - slices = [slice.to_device(self.devices[buffer_index]) for slice in slices] + slices = [slice.to_device(self.devices[i]) for i, slice in enumerate(slices)] self._loaded_batches[buffer_index] = slices # Return loaded samples per-device. @@ -1194,6 +1194,8 @@ def _worker(shard_idx, model, sample_batch, device): raise last_result[0] from last_result[1] # Multi device (GPU) case: Parallelize via threads. else: + [logger.logerr((i , j) for i,j in zip(self.model_gpu_towers, + sample_batches))] threads = [ threading.Thread( target=_worker, args=(shard_idx, model, sample_batch, device) From 7f60cc16f2dce267030d63cabf814645e32fd724 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 17:43:00 +0200 Subject: [PATCH 03/11] next test --- rllib/policy/torch_policy_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 3817d79a015e..dc9ff0fa163f 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1100,6 +1100,10 @@ def _multi_gpu_parallel_grad_calc( results = {} grad_enabled = torch.is_grad_enabled() + logger.warning("MyMarker") + [logger.warning(i, j) for i, j in zip(self.model_gpu_towers, + sample_batches)] + def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) try: @@ -1194,8 +1198,6 @@ def _worker(shard_idx, model, sample_batch, device): raise last_result[0] from last_result[1] # Multi device (GPU) case: Parallelize via threads. 
else: - [logger.logerr((i , j) for i,j in zip(self.model_gpu_towers, - sample_batches))] threads = [ threading.Thread( target=_worker, args=(shard_idx, model, sample_batch, device) From 59aad5818077d8f6eee71939f0b841b0d2d99005 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 17:44:23 +0200 Subject: [PATCH 04/11] test only simpleQ --- .../multi_gpu_learning_tests.yaml | 221 ------------------ 1 file changed, 221 deletions(-) diff --git a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml index bc5d62710f3d..dc66443d42f1 100644 --- a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml @@ -1,204 +1,3 @@ - -a2c-cartpole-v0: - env: CartPole-v0 - run: A2C - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.001 - -appo-cartpole-v0-no-vtrace: - env: CartPole-v0 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - vtrace: false - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -appo-cartpole-v0-vtrace: - env: CartPole-v0 - run: APPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 5 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 1000 - -ddpg-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: DDPG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 8000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 512 - -dqn-cartpole-v0: - env: CartPole-v0 - run: DQN - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 50000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 64 - # Mimic tuned_example for DQN CartPole. - n_step: 3 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -impala-cartpole-v0: - env: CartPole-v0 - run: IMPALA - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 1000 - -pg-cartpole-v0: - env: CartPole-v0 - run: PG - # Minimum reward and total ts (in given time_total_s) to pass this test. 
- pass_criteria: - episode_reward_mean: 130.0 - timesteps_total: 500000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - # Double batch size (2 GPUs). - train_batch_size: 400 - model: - fcnet_hiddens: [64] - fcnet_activation: linear - -ppo-cartpole-v0: - env: CartPole-v0 - run: PPO - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 150.0 - timesteps_total: 300000 - stop: - time_total_s: 600 - config: - num_gpus: 2 - num_workers: 23 - lr: 0.0003 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - # Double batch size (2 GPUs). - train_batch_size: 8000 - -sac-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: 40.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - repeat_delay: 0 - num_gpus: 2 - num_workers: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - initial_alpha: 0.001 - # Double batch size (2 GPUs). - train_batch_size: 512 - -sac-repeat-after-me-env-continuous: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: SAC - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 4500 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - replay_buffer_config: - type: MultiAgentPrioritizedReplayBuffer - num_gpus: 2 - num_workers: 0 - initial_alpha: 0.001 - # Double batch size (2 GPUs). - train_batch_size: 512 - simpleq-cartpole-v0: env: CartPole-v0 run: SimpleQ @@ -211,23 +10,3 @@ simpleq-cartpole-v0: config: num_gpus: 2 num_workers: 0 - -td3-repeat-after-me-env: - env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv - run: TD3 - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -50.0 - timesteps_total: 25000 - stop: - time_total_s: 600 - config: - env_config: - config: - continuous: true - repeat_delay: 0 - - num_gpus: 2 - num_workers: 0 - # Double batch size (2 GPUs). - train_batch_size: 200 From a6c790982201d5e4568484de910ef9006fff24b6 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:02:53 +0200 Subject: [PATCH 05/11] raises device data as exception --- rllib/policy/torch_policy_v2.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index dc9ff0fa163f..f73214f325ac 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,6 +751,18 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} + error_str = str( + { + "tensor_devices": [t["obs"].get_device() for t in device_batches], + "model_devices": [next(m.parameters()).device for m in + self.model_gpu_towers], + "multi_gpu_components": zip(self.model_gpu_towers, device_batches, + self.devices), + } + ) + + raise Exception(error_str) + # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1100,10 +1112,6 @@ def _multi_gpu_parallel_grad_calc( results = {} grad_enabled = torch.is_grad_enabled() - logger.warning("MyMarker") - [logger.warning(i, j) for i, j in zip(self.model_gpu_towers, - sample_batches)] - def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) try: From 8cb5b60ad91a666201068e10a677b057cad14534 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:15:50 +0200 Subject: [PATCH 06/11] only log --- rllib/policy/torch_policy_v2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index f73214f325ac..8560d5f14b04 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,17 +751,17 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} - error_str = str( + log_str = str( { "tensor_devices": [t["obs"].get_device() for t in device_batches], "model_devices": [next(m.parameters()).device for m in self.model_gpu_towers], - "multi_gpu_components": zip(self.model_gpu_towers, device_batches, - self.devices), + "multi_gpu_components": list(zip(self.model_gpu_towers, device_batches, + self.devices)), } ) - raise Exception(error_str) + logger.warning("MyMarker" + log_str) # Do the (maybe parallelized) gradient calculation step. tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) From fba6ca9136178e743d1fd16608212473af79e4ba Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:26:45 +0200 Subject: [PATCH 07/11] log in parallel grad calc --- rllib/policy/torch_policy_v2.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 8560d5f14b04..e5d58e50649e 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -756,12 +756,10 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): "tensor_devices": [t["obs"].get_device() for t in device_batches], "model_devices": [next(m.parameters()).device for m in self.model_gpu_towers], - "multi_gpu_components": list(zip(self.model_gpu_towers, device_batches, - self.devices)), } ) - logger.warning("MyMarker" + log_str) + logger.warning("tensors_and_model_devices" + log_str) # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1114,6 +1112,18 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) + + log_str = str( + { + "shard_idx": shard_idx, + "tensor_devices": [sample_batch["obs"].get_device()], + "model_device": next(model.parameters()).device, + "device": device + } + ) + + logger.warning("parallel_calc_tensor_and_model_devices" + log_str) + try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 device From 2ce484f5bf1d1f802ffd5d05aa68dd00b3d02af9 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 18:50:51 +0200 Subject: [PATCH 08/11] log target model device --- rllib/policy/torch_policy_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index e5d58e50649e..6e3215906c50 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1118,6 +1118,7 @@ def _worker(shard_idx, model, sample_batch, device): "shard_idx": shard_idx, "tensor_devices": [sample_batch["obs"].get_device()], "model_device": next(model.parameters()).device, + "target_model_device": next(self.target_models[model].parameters()).device, "device": device } ) From c4a42589a807517b05ace7e02389b49d8c4b78fd Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:28:47 +0200 Subject: [PATCH 09/11] place samplebatch on correct device --- .../simple_q/simple_q_torch_policy.py | 2 +- rllib/policy/torch_policy_v2.py | 26 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/rllib/algorithms/simple_q/simple_q_torch_policy.py b/rllib/algorithms/simple_q/simple_q_torch_policy.py index 7ceb81d6f5e2..b33e38a185d5 100644 --- a/rllib/algorithms/simple_q/simple_q_torch_policy.py +++ b/rllib/algorithms/simple_q/simple_q_torch_policy.py @@ -174,7 +174,7 @@ def _compute_q_values( ) -> TensorType: _is_training = is_training if is_training is not None else False input_dict = self._lazy_tensor_dict( - SampleBatch(obs=obs_batch, _is_training=_is_training) + SampleBatch(obs=obs_batch, _is_training=_is_training), device=next(model.parameters()).device ) # Make sure, everything is PyTorch tensors. 
model_out, _ = model(input_dict, [], None) diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 6e3215906c50..ae6b316dd479 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -1112,18 +1112,20 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) - - log_str = str( - { - "shard_idx": shard_idx, - "tensor_devices": [sample_batch["obs"].get_device()], - "model_device": next(model.parameters()).device, - "target_model_device": next(self.target_models[model].parameters()).device, - "device": device - } - ) - - logger.warning("parallel_calc_tensor_and_model_devices" + log_str) + # + # log_str = str( + # { + # "shard_idx": shard_idx, + # "tensor_devices": [sample_batch["obs"].get_device()], + # "model_device": next(model.parameters()).device, + # "target_model_device": next(self.target_models[model].parameters()).device, + # "device": device, + # "action_distribution": self.dist_class, + # + # } + # ) + + # logger.warning("parallel_calc_tensor_and_model_devices" + log_str) try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 From 40fdd54fa01e3538326428d4dd9a7af8b071b451 Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:32:54 +0200 Subject: [PATCH 10/11] possible fix --- .../simple_q/simple_q_torch_policy.py | 4 +-- rllib/policy/torch_policy_v2.py | 25 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/rllib/algorithms/simple_q/simple_q_torch_policy.py b/rllib/algorithms/simple_q/simple_q_torch_policy.py index b33e38a185d5..dacc66e21f70 100644 --- a/rllib/algorithms/simple_q/simple_q_torch_policy.py +++ b/rllib/algorithms/simple_q/simple_q_torch_policy.py @@ -173,9 +173,7 @@ def _compute_q_values( self, model: ModelV2, obs_batch: TensorType, is_training=None ) -> TensorType: _is_training = is_training if is_training is not None else False - input_dict = self._lazy_tensor_dict( - SampleBatch(obs=obs_batch, _is_training=_is_training), device=next(model.parameters()).device - ) + input_dict = SampleBatch(obs=obs_batch, _is_training=_is_training) # Make sure, everything is PyTorch tensors. model_out, _ = model(input_dict, [], None) return model_out diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index ae6b316dd479..4a78b6848d55 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -751,16 +751,6 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): ) batch_fetches[f"tower_{i}"] = {"custom_metrics": custom_metrics} - log_str = str( - { - "tensor_devices": [t["obs"].get_device() for t in device_batches], - "model_devices": [next(m.parameters()).device for m in - self.model_gpu_towers], - } - ) - - logger.warning("tensors_and_model_devices" + log_str) - # Do the (maybe parallelized) gradient calculation step. 
tower_outputs = self._multi_gpu_parallel_grad_calc(device_batches) @@ -1112,21 +1102,6 @@ def _multi_gpu_parallel_grad_calc( def _worker(shard_idx, model, sample_batch, device): torch.set_grad_enabled(grad_enabled) - # - # log_str = str( - # { - # "shard_idx": shard_idx, - # "tensor_devices": [sample_batch["obs"].get_device()], - # "model_device": next(model.parameters()).device, - # "target_model_device": next(self.target_models[model].parameters()).device, - # "device": device, - # "action_distribution": self.dist_class, - # - # } - # ) - - # logger.warning("parallel_calc_tensor_and_model_devices" + log_str) - try: with NullContextManager() if device.type == "cpu" else torch.cuda.device( # noqa: E501 device From bc47f1b6d88c7fbf39411541ab79f58e4f1c7d1e Mon Sep 17 00:00:00 2001 From: Artur Niederfahrenhorst Date: Sun, 26 Jun 2022 19:34:55 +0200 Subject: [PATCH 11/11] revert release tests folder --- .../multi_gpu_learning_tests.yaml | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml index dc66443d42f1..bc5d62710f3d 100644 --- a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml @@ -1,3 +1,204 @@ + +a2c-cartpole-v0: + env: CartPole-v0 + run: A2C + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + lr: 0.001 + +appo-cartpole-v0-no-vtrace: + env: CartPole-v0 + run: APPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + vtrace: false + num_gpus: 2 + num_workers: 5 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 1000 + +appo-cartpole-v0-vtrace: + env: CartPole-v0 + run: APPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 5 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 1000 + +ddpg-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: DDPG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 8000 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 512 + +dqn-cartpole-v0: + env: CartPole-v0 + run: DQN + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 50000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 64 + # Mimic tuned_example for DQN CartPole. 
+ n_step: 3 + model: + fcnet_hiddens: [64] + fcnet_activation: linear + +impala-cartpole-v0: + env: CartPole-v0 + run: IMPALA + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + # Double batch size (2 GPUs). + train_batch_size: 1000 + +pg-cartpole-v0: + env: CartPole-v0 + run: PG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 130.0 + timesteps_total: 500000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + # Double batch size (2 GPUs). + train_batch_size: 400 + model: + fcnet_hiddens: [64] + fcnet_activation: linear + +ppo-cartpole-v0: + env: CartPole-v0 + run: PPO + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 150.0 + timesteps_total: 300000 + stop: + time_total_s: 600 + config: + num_gpus: 2 + num_workers: 23 + lr: 0.0003 + observation_filter: MeanStdFilter + num_sgd_iter: 6 + vf_loss_coeff: 0.01 + model: + fcnet_hiddens: [32] + fcnet_activation: linear + vf_share_layers: true + # Double batch size (2 GPUs). + train_batch_size: 8000 + +sac-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: SAC + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 40.0 + timesteps_total: 4500 + stop: + time_total_s: 600 + config: + env_config: + config: + repeat_delay: 0 + num_gpus: 2 + num_workers: 0 + replay_buffer_config: + type: MultiAgentPrioritizedReplayBuffer + initial_alpha: 0.001 + # Double batch size (2 GPUs). + train_batch_size: 512 + +sac-repeat-after-me-env-continuous: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: SAC + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 4500 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + replay_buffer_config: + type: MultiAgentPrioritizedReplayBuffer + num_gpus: 2 + num_workers: 0 + initial_alpha: 0.001 + # Double batch size (2 GPUs). + train_batch_size: 512 + simpleq-cartpole-v0: env: CartPole-v0 run: SimpleQ @@ -10,3 +211,23 @@ simpleq-cartpole-v0: config: num_gpus: 2 num_workers: 0 + +td3-repeat-after-me-env: + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv + run: TD3 + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: -50.0 + timesteps_total: 25000 + stop: + time_total_s: 600 + config: + env_config: + config: + continuous: true + repeat_delay: 0 + + num_gpus: 2 + num_workers: 0 + # Double batch size (2 GPUs). + train_batch_size: 200
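
Taken together, the eleven patches above read as a bisection-by-logging session: the instrumentation added in patches 02 through 08 compares the device of each tower's batch tensors against `next(model.parameters()).device`, and the change that survives to the final state is in `simple_q_torch_policy.py`, where `_compute_q_values` stops re-wrapping the observations with `self._lazy_tensor_dict(...)` (which would presumably move them onto the policy's default device) and instead passes the `SampleBatch` through unchanged, so each GPU tower keeps the slice that `load_batch_into_buffer` already placed on its device via `slice.to_device(self.devices[i])` (see the first two patches). The snippet below is a minimal, standalone sketch of that failure mode and of the device check used while debugging; it is not RLlib code, and the helper name `assert_same_device` is made up for illustration.

    # Standalone sketch (not RLlib code): reproduces the multi-GPU device-mismatch
    # failure mode the patches above are chasing, plus the kind of check the debug
    # logging performed. `assert_same_device` is a hypothetical helper.
    import torch
    import torch.nn as nn

    def assert_same_device(model: nn.Module, batch: dict) -> None:
        """Raise if any tensor in `batch` is not on the model's parameter device."""
        model_device = next(model.parameters()).device
        for key, tensor in batch.items():
            if torch.is_tensor(tensor) and tensor.device != model_device:
                raise RuntimeError(
                    f"Batch key '{key}' is on {tensor.device}, "
                    f"but the model tower lives on {model_device}."
                )

    def main() -> None:
        # One "tower" per device; fall back to CPU-only if fewer than 2 GPUs are visible.
        if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
            devices = [torch.device("cuda:0"), torch.device("cuda:1")]
        else:
            devices = [torch.device("cpu"), torch.device("cpu")]

        towers = [nn.Linear(4, 2).to(dev) for dev in devices]

        # Correct pattern (what the last patch restores): each tower consumes a
        # batch slice that already lives on that tower's device.
        for tower, dev in zip(towers, devices):
            batch = {"obs": torch.randn(8, 4, device=dev)}
            assert_same_device(tower, batch)  # passes
            tower(batch["obs"])               # forward pass succeeds

        # Buggy pattern (what the debug logging exposed): re-homing the slice onto
        # the default device (device 0) breaks every tower except the first.
        if devices[0] != devices[1]:
            bad_batch = {"obs": torch.randn(8, 4, device=devices[0])}
            try:
                assert_same_device(towers[1], bad_batch)
            except RuntimeError as e:
                print("Caught expected mismatch:", e)

    if __name__ == "__main__":
        main()

In the real code path the per-tower placement happens once, at load time, in `load_batch_into_buffer`; the regression was re-homing the tensors a second time inside the loss's Q-value computation, which only shows up when `num_gpus > 1` — hence the temporary trimming of `multi_gpu_learning_tests.yaml` to the SimpleQ case in patch 04 and its revert in patch 11.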