diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 14ea3c43aea3..0abcb01a2124 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -2796,8 +2796,8 @@
     cluster_compute: 8gpus_96cpus.yaml

   run:
-    timeout: 600
-    script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --as-release-test
+    timeout: 1200
+    script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env=ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --stop-reward=20.0 --as-release-test

   alert: default
diff --git a/rllib/BUILD b/rllib/BUILD
index 20908f0d9060..6948f17e903c 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2379,7 +2379,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
 )

 py_test(
@@ -2388,7 +2388,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples", "examples_use_all_core"],
     size = "large",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
+    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
 )

 py_test(
@@ -2397,7 +2397,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
 )

 py_test(
@@ -2406,7 +2406,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
+    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
 )

 py_test(
@@ -2415,7 +2415,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
 )

 py_test(
["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] ) # @OldAPIStack @@ -2434,7 +2434,7 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] + args = ["--as-test", "--evaluation-parallel-to-training", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] ) # @OldAPIStack @@ -2444,7 +2444,7 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] + args = ["--as-test", "--evaluation-parallel-to-training", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] ) # subdirectory: gpus/ diff --git a/rllib/benchmarks/ppo/benchmark_atari_ppo.py b/rllib/benchmarks/ppo/benchmark_atari_ppo.py new file mode 100644 index 000000000000..0b697ff4b902 --- /dev/null +++ b/rllib/benchmarks/ppo/benchmark_atari_ppo.py @@ -0,0 +1,123 @@ +"""Script to execute RLlib's official PPO Atari benchmarks. + +How to run this script +---------------------- +`python [script-name].py --enable-new-api-stack --stop-timesteps 12000000 +--num-gpus=4 --num-env-runners=95` + +In order to only run individual or lists of envs, you can provide a list of env-strings +under the `--env` arg, such as `--env ALE/Pong-v5,ALE/Breakout-v5`. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +TODO (sven): Link to RLlib's to-be-created benchmark page. +""" +import subprocess + +from ray.rllib.utils.test_utils import add_rllib_example_script_args + + +parser = add_rllib_example_script_args() + +# Might need `gymnasium[atari, other]` to be installed. + +# See the following links for benchmark results of other libraries: +# Original paper: https://arxiv.org/abs/1812.05905 +# CleanRL: https://wandb.ai/cleanrl/cleanrl.benchmark/reports/Mujoco--VmlldzoxODE0NjE +# AgileRL: https://github.com/AgileRL/AgileRL?tab=readme-ov-file#benchmarks +# [0] = reward to expect for DQN rainbow [1] = timesteps to run (always 200M for DQN +# rainbow). +# Note that for PPO, we simply run everything for 6M ts. 
+benchmark_envs = {
+    "ALE/Alien-v5": (6022.9, 200000000),
+    "ALE/Amidar-v5": (202.8, 200000000),
+    "ALE/Assault-v5": (14491.7, 200000000),
+    "ALE/Asterix-v5": (280114.0, 200000000),
+    "ALE/Asteroids-v5": (2249.4, 200000000),
+    "ALE/Atlantis-v5": (814684.0, 200000000),
+    "ALE/BankHeist-v5": (826.0, 200000000),
+    "ALE/BattleZone-v5": (52040.0, 200000000),
+    "ALE/BeamRider-v5": (21768.5, 200000000),
+    "ALE/Berzerk-v5": (1793.4, 200000000),
+    "ALE/Bowling-v5": (39.4, 200000000),
+    "ALE/Boxing-v5": (54.9, 200000000),
+    "ALE/Breakout-v5": (379.5, 200000000),
+    "ALE/Centipede-v5": (7160.9, 200000000),
+    "ALE/ChopperCommand-v5": (10916.0, 200000000),
+    "ALE/CrazyClimber-v5": (143962.0, 200000000),
+    "ALE/Defender-v5": (47671.3, 200000000),
+    "ALE/DemonAttack-v5": (109670.7, 200000000),
+    "ALE/DoubleDunk-v5": (-0.6, 200000000),
+    "ALE/Enduro-v5": (2061.1, 200000000),
+    "ALE/FishingDerby-v5": (22.6, 200000000),
+    "ALE/Freeway-v5": (29.1, 200000000),
+    "ALE/Frostbite-v5": (4141.1, 200000000),
+    "ALE/Gopher-v5": (72595.7, 200000000),
+    "ALE/Gravitar-v5": (567.5, 200000000),
+    "ALE/Hero-v5": (50496.8, 200000000),
+    "ALE/IceHockey-v5": (-11685.8, 200000000),
+    "ALE/Kangaroo-v5": (10841.0, 200000000),
+    "ALE/Krull-v5": (6715.5, 200000000),
+    "ALE/KungFuMaster-v5": (28999.8, 200000000),
+    "ALE/MontezumaRevenge-v5": (154.0, 200000000),
+    "ALE/MsPacman-v5": (2570.2, 200000000),
+    "ALE/NameThisGame-v5": (11686.5, 200000000),
+    "ALE/Phoenix-v5": (103061.6, 200000000),
+    "ALE/Pitfall-v5": (-37.6, 200000000),
+    "ALE/Pong-v5": (19.0, 200000000),
+    "ALE/PrivateEye-v5": (1704.4, 200000000),
+    "ALE/Qbert-v5": (18397.6, 200000000),
+    "ALE/RoadRunner-v5": (54261.0, 200000000),
+    "ALE/Robotank-v5": (55.2, 200000000),
+    "ALE/Seaquest-v5": (19176.0, 200000000),
+    "ALE/Skiing-v5": (-11685.8, 200000000),
+    "ALE/Solaris-v5": (2860.7, 200000000),
+    "ALE/SpaceInvaders-v5": (12629.0, 200000000),
+    "ALE/StarGunner-v5": (123853.0, 200000000),
+    "ALE/Surround-v5": (7.0, 200000000),
+    "ALE/Tennis-v5": (-2.2, 200000000),
+    "ALE/TimePilot-v5": (11190.5, 200000000),
+    "ALE/Tutankham-v5": (126.9, 200000000),
+    "ALE/Venture-v5": (45.0, 200000000),
+    "ALE/VideoPinball-v5": (506817.2, 200000000),
+    "ALE/WizardOfWor-v5": (14631.5, 200000000),
+    "ALE/YarsRevenge-v5": (93007.9, 200000000),
+    "ALE/Zaxxon-v5": (19658.0, 200000000),
+}
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Compile the base command running the actual `tuned_example` script.
+    base_commands = [
+        "python",
+        "../../tuned_examples/ppo/atari_ppo.py",
+        "--enable-new-api-stack",
+        f"--num-env-runners={args.num_env_runners}" if args.num_env_runners else "",
+        f"--num-gpus={args.num_gpus}",
+        f"--wandb-key={args.wandb_key}" if args.wandb_key else "",
+        f"--wandb-project={args.wandb_project}" if args.wandb_project else "",
+        f"--wandb-run-name={args.wandb_run_name}" if args.wandb_run_name else "",
+        f"--stop-timesteps={args.stop_timesteps}",
+        f"--checkpoint-freq={args.checkpoint_freq}",
+        "--checkpoint-at-end" if args.checkpoint_at_end else "",
+    ]
+
+    # Loop through all envs (given on the command line via `--env` or found in
+    # `benchmark_envs`) and run the `tuned_example` script for each of them.
+    for env_name in args.env.split(",") if args.env else benchmark_envs.keys():
+        # Drop empty strings (options that were not provided on the command line).
+        commands = []
+        for c in base_commands:
+            if c != "":
+                commands.append(c)
+        commands.append(f"--env={env_name}")
+        commands.append(f"--wandb-run-name={env_name}")
+        print(f"Running {env_name} through command line=`{commands}`")
+        subprocess.run(commands)
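Side note on the command assembly above: options the user did not provide enter `base_commands` as empty strings and are filtered out per env before the subprocess call, and the env name is appended both as `--env` and as the WandB run name. A minimal, self-contained sketch of that behavior (the concrete values below are illustrative, not taken from this PR):

    # Illustrative only: mirrors the filtering/append logic of the benchmark runner.
    base_commands = [
        "python",
        "../../tuned_examples/ppo/atari_ppo.py",
        "--enable-new-api-stack",
        "",  # e.g. `--wandb-key=...` when no WandB key was given
        "--stop-timesteps=6000000",
    ]
    env_name = "ALE/Pong-v5"
    commands = [c for c in base_commands if c != ""]
    commands += [f"--env={env_name}", f"--wandb-run-name={env_name}"]
    print(commands)
    # -> ['python', '../../tuned_examples/ppo/atari_ppo.py', '--enable-new-api-stack',
    #     '--stop-timesteps=6000000', '--env=ALE/Pong-v5', '--wandb-run-name=ALE/Pong-v5']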
diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py
index 76aad3eccdf4..dce216147a32 100644
--- a/rllib/examples/evaluation/custom_evaluation.py
+++ b/rllib/examples/evaluation/custom_evaluation.py
@@ -85,7 +85,6 @@
 parser = add_rllib_example_script_args(
     default_iters=50, default_reward=0.7, default_timesteps=50000
 )
-parser.add_argument("--evaluation-parallel-to-training", action="store_true")
 parser.add_argument("--no-custom-eval", action="store_true")
 parser.add_argument("--corridor-length-training", type=int, default=10)
 parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py
index d0ab6b12e239..382cec33c77e 100644
--- a/rllib/examples/evaluation/evaluation_parallel_to_training.py
+++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -53,7 +53,7 @@
 |          71.7485 | 100000 |   476.51 |             476.51 |
 +------------------+--------+----------+--------------------+

-When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag),
+When running without parallel evaluation (no `--evaluation-parallel-to-training` flag),
 the experiment takes considerably longer (~70sec vs ~80sec):
 +-----------------------------+------------+-----------------+--------+
 | Trial name                  | status     | loc             |   iter |
@@ -89,37 +89,10 @@
 from ray.tune.registry import get_trainable_cls, register_env

 parser = add_rllib_example_script_args(default_reward=500.0)
-parser.add_argument(
-    "--evaluation-duration",
-    type=lambda v: v if v == "auto" else int(v),
-    default="auto",
-    help="Number of evaluation episodes/timesteps to run each iteration. "
-    "If 'auto', will run as many as possible during train pass.",
-)
-parser.add_argument(
-    "--evaluation-duration-unit",
-    type=str,
-    default="timesteps",
-    choices=["episodes", "timesteps"],
-    help="The unit in which to measure the duration (`episodes` or `timesteps`).",
-)
-parser.add_argument(
-    "--evaluation-not-parallel-to-training",
-    action="store_true",
-    help="Whether to NOT run evaluation parallel to training, but in sequence.",
-)
-parser.add_argument(
-    "--evaluation-num-env-runners",
-    type=int,
-    default=2,
-    help="The number of evaluation EnvRunners to setup. "
-    "0 for a single local evaluation EnvRunner.",
-)
-parser.add_argument(
-    "--evaluation-interval",
-    type=int,
-    default=1,
-    help="Every how many train iterations should we run an evaluation loop?",
+parser.set_defaults(
+    evaluation_num_env_runners=2,
+    evaluation_interval=1,
+    evaluation_duration_unit="timesteps",
 )
 parser.add_argument(
     "--evaluation-parallel-to-training-wo-thread",
@@ -219,9 +192,7 @@ def on_train_result(
         .evaluation(
             # Parallel evaluation+training config.
             # Switch on evaluation in parallel with training.
-            evaluation_parallel_to_training=(
-                not args.evaluation_not_parallel_to_training
-            ),
+            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
             # Use two evaluation workers. Must be >0, otherwise,
             # evaluation will run on a local worker and block (no parallelism).
             evaluation_num_env_runners=args.evaluation_num_env_runners,
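The per-script `add_argument` calls can be dropped here because the same evaluation options now live in the shared parser built by `add_rllib_example_script_args` (see the `rllib/utils/test_utils.py` changes further below); `parser.set_defaults(...)` then only overrides the library-wide defaults this example cares about. A minimal, stand-alone sketch of that argparse pattern, using two of the options added in this PR:

    import argparse

    # Shared parser (as in `add_rllib_example_script_args`): defines the options
    # once, with library-wide defaults.
    parser = argparse.ArgumentParser()
    parser.add_argument("--evaluation-interval", type=int, default=0)
    parser.add_argument(
        "--evaluation-duration-unit",
        default="episodes",
        choices=["episodes", "timesteps"],
    )

    # Example script: overrides only the defaults it needs, without redefining
    # (and thereby duplicating) the options themselves.
    parser.set_defaults(evaluation_interval=1, evaluation_duration_unit="timesteps")

    args = parser.parse_args([])
    assert args.evaluation_interval == 1
    assert args.evaluation_duration_unit == "timesteps"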
diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py
index ee76d8d3f9ce..1fc44dfc9a40 100644
--- a/rllib/tuned_examples/ppo/atari_ppo.py
+++ b/rllib/tuned_examples/ppo/atari_ppo.py
@@ -1,19 +1,18 @@
 import gymnasium as gym

+from ray import tune
 from ray.rllib.algorithms.ppo import PPOConfig
 from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
 from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
 from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
-from ray.rllib.utils.metrics import (
-    ENV_RUNNER_RESULTS,
-    EPISODE_RETURN_MEAN,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME,
-)
 from ray.rllib.utils.test_utils import add_rllib_example_script_args
-from ray import tune

-parser = add_rllib_example_script_args()
+parser = add_rllib_example_script_args(
+    default_reward=float("inf"),
+    default_timesteps=3000000,
+    default_iters=100000000000,
+)
 # Use `parser` to add your own custom command line options to this script
 # and (if needed) use their values toset up `config` below.
 args = parser.parse_args()
@@ -81,13 +80,8 @@ def _env_creator(cfg):
     )
 )

-stop = {
-    f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000,
-}
-
 if __name__ == "__main__":
     from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

-    run_rllib_example_script_experiment(config, args=args, stop=stop)
+    run_rllib_example_script_experiment(config, args=args)
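With the hard-coded `stop` dict removed, the stopping behavior is now driven by the shared parser defaults set above (`default_timesteps=3000000`) and by CLI flags such as `--stop-reward=20.0` in the updated release test, which `run_rllib_example_script_experiment` turns into stop criteria. As a sketch (the flag-to-metric mapping happens inside that helper and is not shown in this diff), the removed dict corresponds roughly to:

    from ray.rllib.utils.metrics import (
        ENV_RUNNER_RESULTS,
        EPISODE_RETURN_MEAN,
        NUM_ENV_STEPS_SAMPLED_LIFETIME,
    )

    # Rough equivalent of the removed `stop` dict, now reachable via
    # `--stop-reward=20.0 --stop-timesteps=1500000` instead of being baked in.
    stop = {
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
        NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000,
    }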
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index 6fe3f8069d43..38408356530e 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -135,6 +135,48 @@ def add_rllib_example_script_args(
         "experiment is then the sum over all individual agents' rewards.",
     )

+    # Evaluation options.
+    parser.add_argument(
+        "--evaluation-num-env-runners",
+        type=int,
+        default=0,
+        help="The number of evaluation (remote) EnvRunners to use for the experiment.",
+    )
+    parser.add_argument(
+        "--evaluation-interval",
+        type=int,
+        default=0,
+        help="Run one round of evaluation every this many training iterations. "
+        "Use 0 (default) to disable evaluation.",
+    )
+    parser.add_argument(
+        "--evaluation-duration",
+        type=lambda v: v if v == "auto" else int(v),
+        default=10,
+        help="The number of evaluation units to run in each evaluation round. "
+        "Use `--evaluation-duration-unit` to count either in 'episodes' "
+        "or 'timesteps'. If 'auto', runs as many units as possible during the "
+        "train pass (requires `--evaluation-parallel-to-training`).",
+    )
+    parser.add_argument(
+        "--evaluation-duration-unit",
+        type=str,
+        default="episodes",
+        choices=["episodes", "timesteps"],
+        help="The evaluation duration unit to count by: 'episodes' or "
+        "'timesteps'. `--evaluation-duration` of these units are run in each "
+        "evaluation round. If `--evaluation-duration=auto`, this setting does not "
+        "matter.",
+    )
+    parser.add_argument(
+        "--evaluation-parallel-to-training",
+        action="store_true",
+        help="Whether to run evaluation in parallel with training. This might speed up "
+        "your overall iteration time. Be aware that when using this option, the "
+        "reported evaluation results always refer to the iteration prior to the "
+        "current one.",
+    )
+
     # tune.Tuner options.
     parser.add_argument(
         "--no-tune",
@@ -1434,6 +1476,16 @@ def run_rllib_example_script_experiment(
         num_cpus_for_main_process=1,
     )

+    # Evaluation setup (only if requested via `--evaluation-interval` > 0).
+    if args.evaluation_interval > 0:
+        config.evaluation(
+            evaluation_num_env_runners=args.evaluation_num_env_runners,
+            evaluation_interval=args.evaluation_interval,
+            evaluation_duration=args.evaluation_duration,
+            evaluation_duration_unit=args.evaluation_duration_unit,
+            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
+        )
+
     # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object).
     if args.no_tune:
         assert not args.as_test and not args.as_release_test
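Taken together, any example script that builds its parser with `add_rllib_example_script_args` and runs via `run_rllib_example_script_experiment` now gets evaluation wired up from the command line alone. A minimal sketch (the file name, env, and PPO config below are illustrative only, not part of this PR):

    # minimal_eval_example.py -- hypothetical example script.
    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.utils.test_utils import (
        add_rllib_example_script_args,
        run_rllib_example_script_experiment,
    )

    parser = add_rllib_example_script_args(default_reward=450.0)
    args = parser.parse_args()

    config = PPOConfig().environment("CartPole-v1")

    if __name__ == "__main__":
        # Passing `--evaluation-interval=1 --evaluation-duration=auto
        # --evaluation-parallel-to-training` on the command line is enough to turn on
        # parallel evaluation: `run_rllib_example_script_experiment` calls
        # `config.evaluation(...)` with these args whenever the interval is > 0.
        run_rllib_example_script_experiment(config, args=args)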