[RLlib] Add "official" benchmark script for Atari PPO benchmarks (new API stack). #45697

Merged (8 commits) on Jun 5, 2024
4 changes: 2 additions & 2 deletions release/release_tests.yaml
@@ -2796,8 +2796,8 @@
cluster_compute: 8gpus_96cpus.yaml

run:
-  timeout: 600
-  script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --as-release-test
+  timeout: 1200
+  script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env=ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --stop-reward=20.0 --as-release-test

alert: default

16 changes: 8 additions & 8 deletions rllib/BUILD
@@ -2379,7 +2379,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
)

py_test(
@@ -2388,7 +2388,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples", "examples_use_all_core"],
size = "large",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
)

py_test(
@@ -2397,7 +2397,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
)

py_test(
@@ -2406,7 +2406,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
)

py_test(
@@ -2415,7 +2415,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
)

py_test(
@@ -2424,7 +2424,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"]
args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"]
)

# @OldAPIStack
@@ -2434,7 +2434,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"]
args = ["--as-test", "--evaluation-parallel-to-training", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"]
)

# @OldAPIStack
@@ -2444,7 +2444,7 @@ py_test(
tags = ["team:rllib", "exclusive", "examples"],
size = "medium",
srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
args = ["--as-test", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"]
args = ["--as-test", "--evaluation-parallel-to-training", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"]
)

# subdirectory: gpus/
123 changes: 123 additions & 0 deletions rllib/benchmarks/ppo/benchmark_atari_ppo.py
@@ -0,0 +1,123 @@
"""Script to execute RLlib's official PPO Atari benchmarks.

How to run this script
----------------------
`python [script-name].py --enable-new-api-stack --stop-timesteps 12000000
--num-gpus=4 --num-env-runners=95`

To run only individual envs, or a subset of envs, provide a comma-separated list of env strings
under the `--env` arg, such as `--env ALE/Pong-v5,ALE/Breakout-v5`.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
TODO (sven): Link to RLlib's to-be-created benchmark page.
"""
import subprocess

from ray.rllib.utils.test_utils import add_rllib_example_script_args


parser = add_rllib_example_script_args()

# Might need `gymnasium[atari, other]` to be installed.

# See the following links for benchmark results of other libraries:
# Original paper: https://arxiv.org/abs/1812.05905
# CleanRL: https://wandb.ai/cleanrl/cleanrl.benchmark/reports/Mujoco--VmlldzoxODE0NjE
# AgileRL: https://github.com/AgileRL/AgileRL?tab=readme-ov-file#benchmarks
# [0] = reward to expect for DQN rainbow [1] = timesteps to run (always 200M for DQN
# rainbow).
# Note that for PPO, we simply run everything for 6M ts.
Review comment (Collaborator):
In atari_ppo.py the timesteps are set to 3M.

benchmark_envs = {
"ALE/Alien-v5": (6022.9, 200000000),
"ALE/Amidar-v5": (202.8, 200000000),
"ALE/Assault-v5": (14491.7, 200000000),
"ALE/Asterix-v5": (280114.0, 200000000),
"ALE/Asteroids-v5": (2249.4, 200000000),
"ALE/Atlantis-v5": (814684.0, 200000000),
"ALE/BankHeist-v5": (826.0, 200000000),
"ALE/BattleZone-v5": (52040.0, 200000000),
"ALE/BeamRider-v5": (21768.5, 200000000),
"ALE/Berzerk-v5": (1793.4, 200000000),
"ALE/Bowling-v5": (39.4, 200000000),
"ALE/Boxing-v5": (54.9, 200000000),
"ALE/Breakout-v5": (379.5, 200000000),
"ALE/Centipede-v5": (7160.9, 200000000),
"ALE/ChopperCommand-v5": (10916.0, 200000000),
"ALE/CrazyClimber-v5": (143962.0, 200000000),
"ALE/Defender-v5": (47671.3, 200000000),
"ALE/DemonAttack-v5": (109670.7, 200000000),
"ALE/DoubleDunk-v5": (-0.6, 200000000),
"ALE/Enduro-v5": (2061.1, 200000000),
"ALE/FishingDerby-v5": (22.6, 200000000),
"ALE/Freeway-v5": (29.1, 200000000),
"ALE/Frostbite-v5": (4141.1, 200000000),
"ALE/Gopher-v5": (72595.7, 200000000),
"ALE/Gravitar-v5": (567.5, 200000000),
"ALE/Hero-v5": (50496.8, 200000000),
"ALE/IceHockey-v5": (-11685.8, 200000000),
"ALE/Kangaroo-v5": (10841.0, 200000000),
"ALE/Krull-v5": (6715.5, 200000000),
"ALE/KungFuMaster-v5": (28999.8, 200000000),
"ALE/MontezumaRevenge-v5": (154.0, 200000000),
"ALE/MsPacman-v5": (2570.2, 200000000),
"ALE/NameThisGame-v5": (11686.5, 200000000),
"ALE/Phoenix-v5": (103061.6, 200000000),
"ALE/Pitfall-v5": (-37.6, 200000000),
"ALE/Pong-v5": (19.0, 200000000),
"ALE/PrivateEye-v5": (1704.4, 200000000),
"ALE/Qbert-v5": (18397.6, 200000000),
"ALE/RoadRunner-v5": (54261.0, 200000000),
"ALE/Robotank-v5": (55.2, 200000000),
"ALE/Seaquest-v5": (19176.0, 200000000),
"ALE/Skiing-v5": (-11685.8, 200000000),
"ALE/Solaris-v5": (2860.7, 200000000),
"ALE/SpaceInvaders-v5": (12629.0, 200000000),
"ALE/StarGunner-v5": (123853.0, 200000000),
"ALE/Surround-v5": (7.0, 200000000),
"ALE/Tennis-v5": (-2.2, 200000000),
"ALE/TimePilot-v5": (11190.5, 200000000),
"ALE/Tutankham-v5": (126.9, 200000000),
"ALE/Venture-v5": (45.0, 200000000),
"ALE/VideoPinball-v5": (506817.2, 200000000),
"ALE/WizardOfWor-v5": (14631.5, 200000000),
"ALE/YarsRevenge-v5": (93007.9, 200000000),
"ALE/Zaxxon-v5": (19658.0, 200000000),
}
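
# Illustrative example (not part of the PR's file): how to read the table above.
# Each value is (reference return of DQN Rainbow, timesteps Rainbow was trained for).
pong_target_return, rainbow_timesteps = benchmark_envs["ALE/Pong-v5"]
assert (pong_target_return, rainbow_timesteps) == (19.0, 200000000)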


if __name__ == "__main__":
    args = parser.parse_args()

    # Compile the base command running the actual `tuned_example` script.
    base_commands = [
        "python",
        "../../tuned_examples/ppo/atari_ppo.py",
        "--enable-new-api-stack",
        f"--num-env-runners={args.num_env_runners}" if args.num_env_runners else "",
        f"--num-gpus={args.num_gpus}",
        f"--wandb-key={args.wandb_key}" if args.wandb_key else "",
        f"--wandb-project={args.wandb_project}" if args.wandb_project else "",
        f"--wandb-run-name={args.wandb_run_name}" if args.wandb_run_name else "",
        f"--stop-timesteps={args.stop_timesteps}",
        f"--checkpoint-freq={args.checkpoint_freq}",
        "--checkpoint-at-end" if args.checkpoint_at_end else "",
    ]

    # Loop through all envs (given on the command line or found in `benchmark_envs`)
    # and run the `tuned_example` script for each of them.
    for env_name in args.env.split(",") if args.env else benchmark_envs.keys():
        # Remove empty command strings.
        commands = []
        for c in base_commands:
            if c != "":
                commands.append(c)
        commands.append(f"--env={env_name}")
        commands.append(f"--wandb-run-name={env_name}")
        print(f"Running {env_name} through command line=`{commands}`")
        subprocess.run(commands)
Review comment (Collaborator):
It seems somewhat strange to me that we emulate running from the command line, which in turn runs a script that could have been triggered directly in the loop. It makes sense that users can run single envs, but why not trigger them directly in the loop?
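
For illustration, a minimal sketch of the reviewer's suggestion: build the config in-process and run it directly in the loop instead of shelling out via `subprocess`. The `build_atari_ppo_config` helper is an assumption (it is not part of this PR); `atari_ppo.py` would need to be refactored to expose something like it.

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.test_utils import run_rllib_example_script_experiment


def build_atari_ppo_config(env: str) -> PPOConfig:
    # Hypothetical helper: a real refactor would return the exact config that
    # atari_ppo.py builds (Atari wrappers, frame stacking, CNN model, etc.).
    return PPOConfig().environment(env)


# Reuses `args` and `benchmark_envs` from the benchmark script above.
for env_name in args.env.split(",") if args.env else benchmark_envs.keys():
    config = build_atari_ppo_config(env_name)
    run_rllib_example_script_experiment(config, args=args)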

1 change: 0 additions & 1 deletion rllib/examples/evaluation/custom_evaluation.py
@@ -85,7 +85,6 @@
parser = add_rllib_example_script_args(
default_iters=50, default_reward=0.7, default_timesteps=50000
)
parser.add_argument("--evaluation-parallel-to-training", action="store_true")
parser.add_argument("--no-custom-eval", action="store_true")
parser.add_argument("--corridor-length-training", type=int, default=10)
parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
41 changes: 6 additions & 35 deletions rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -53,7 +53,7 @@
| 71.7485 | 100000 | 476.51 | 476.51 |
+------------------+--------+----------+--------------------+

-When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag),
+When running without parallel evaluation (no `--evaluation-parallel-to-training` flag),
the experiment takes considerably longer (~70sec vs ~80sec):
+-----------------------------+------------+-----------------+--------+
| Trial name | status | loc | iter |
@@ -89,37 +89,10 @@
from ray.tune.registry import get_trainable_cls, register_env

parser = add_rllib_example_script_args(default_reward=500.0)
-parser.add_argument(
-    "--evaluation-duration",
-    type=lambda v: v if v == "auto" else int(v),
-    default="auto",
-    help="Number of evaluation episodes/timesteps to run each iteration. "
-    "If 'auto', will run as many as possible during train pass.",
-)
-parser.add_argument(
-    "--evaluation-duration-unit",
-    type=str,
-    default="timesteps",
-    choices=["episodes", "timesteps"],
-    help="The unit in which to measure the duration (`episodes` or `timesteps`).",
-)
-parser.add_argument(
-    "--evaluation-not-parallel-to-training",
-    action="store_true",
-    help="Whether to NOT run evaluation parallel to training, but in sequence.",
-)
-parser.add_argument(
-    "--evaluation-num-env-runners",
-    type=int,
-    default=2,
-    help="The number of evaluation EnvRunners to setup. "
-    "0 for a single local evaluation EnvRunner.",
-)
-parser.add_argument(
-    "--evaluation-interval",
-    type=int,
-    default=1,
-    help="Every how many train iterations should we run an evaluation loop?",
+parser.set_defaults(
+    evaluation_num_env_runners=2,
+    evaluation_interval=1,
+    evaluation_duration_unit="timesteps",
)
parser.add_argument(
"--evaluation-parallel-to-training-wo-thread",
@@ -219,9 +192,7 @@ def on_train_result(
.evaluation(
# Parallel evaluation+training config.
# Switch on evaluation in parallel with training.
-    evaluation_parallel_to_training=(
-        not args.evaluation_not_parallel_to_training
-    ),
+    evaluation_parallel_to_training=args.evaluation_parallel_to_training,
# Use two evaluation workers. Must be >0, otherwise,
# evaluation will run on a local worker and block (no parallelism).
evaluation_num_env_runners=args.evaluation_num_env_runners,
20 changes: 7 additions & 13 deletions rllib/tuned_examples/ppo/atari_ppo.py
@@ -1,19 +1,18 @@
import gymnasium as gym

-from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
-from ray.rllib.utils.metrics import (
-    ENV_RUNNER_RESULTS,
-    EPISODE_RETURN_MEAN,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME,
-)
from ray.rllib.utils.test_utils import add_rllib_example_script_args
+from ray import tune


-parser = add_rllib_example_script_args()
+parser = add_rllib_example_script_args(
+    default_reward=float("inf"),
+    default_timesteps=3000000,
Review comment (Collaborator):
Here we set 3M timesteps, while above in benchmark_atari_ppo.py the comment mentions 6M.

+    default_iters=100000000000,
+)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()
@@ -81,13 +80,8 @@ def _env_creator(cfg):
)
)

-stop = {
-    f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000,
-}


if __name__ == "__main__":
from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

-    run_rllib_example_script_experiment(config, args=args, stop=stop)
+    run_rllib_example_script_experiment(config, args=args)
52 changes: 52 additions & 0 deletions rllib/utils/test_utils.py
@@ -135,6 +135,48 @@ def add_rllib_example_script_args(
"experiment is then the sum over all individual agents' rewards.",
)

# Evaluation options.
parser.add_argument(
"--evaluation-num-env-runners",
type=int,
default=0,
help="The number of evaluation (remote) EnvRunners to use for the experiment.",
)
parser.add_argument(
"--evaluation-interval",
type=int,
default=0,
help="Every how many iterations to run one round of evaluation. "
"Use 0 (default) to disable evaluation.",
)
parser.add_argument(
"--evaluation-duration",
type=lambda v: v if v == "auto" else int(v),
default=10,
help="The number of evaluation units to run each evaluation round. "
"Use `--evaluation-duration-unit` to count either in 'episodes' "
"or 'timesteps'. If 'auto', will run as many as possible during train pass ("
"`--evaluation-parallel-to-training` must be set then).",
)
parser.add_argument(
"--evaluation-duration-unit",
type=str,
default="episodes",
choices=["episodes", "timesteps"],
help="The evaluation duration unit to count by. One of 'episodes' or "
"'timesteps'. This unit will be run `--evaluation-duration` times in each "
"evaluation round. If `--evaluation-duration=auto`, this setting does not "
"matter.",
)
parser.add_argument(
"--evaluation-parallel-to-training",
action="store_true",
help="Whether to run evaluation parallel to training. This might help speed up "
"your overall iteration time. Be aware that when using this option, your "
"reported evaluation results are referring to one iteration before the current "
"one.",
)

# tune.Tuner options.
parser.add_argument(
"--no-tune",
@@ -1434,6 +1476,16 @@ def run_rllib_example_script_experiment(
num_cpus_for_main_process=1,
)

# Evaluation setup.
if args.evaluation_interval > 0:
config.evaluation(
evaluation_num_env_runners=args.evaluation_num_env_runners,
evaluation_interval=args.evaluation_interval,
evaluation_duration=args.evaluation_duration,
evaluation_duration_unit=args.evaluation_duration_unit,
evaluation_parallel_to_training=args.evaluation_parallel_to_training,
)

# Run the experiment w/o Tune (directly operate on the RLlib Algorithm object).
if args.no_tune:
assert not args.as_test and not args.as_release_test
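
As a rough usage sketch (not part of the diff): with the flags added above, running an example script with `--evaluation-interval=1 --evaluation-num-env-runners=2 --evaluation-duration=auto --evaluation-parallel-to-training` makes `run_rllib_example_script_experiment` apply roughly the following evaluation settings; the PPO config and env name below are placeholders.

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")  # placeholder env
    .evaluation(
        evaluation_num_env_runners=2,
        evaluation_interval=1,
        # "auto": evaluate for as long as the parallel training step takes.
        evaluation_duration="auto",
        evaluation_duration_unit="episodes",  # ignored when duration="auto"
        # Evaluation results then refer to the iteration before the current one.
        evaluation_parallel_to_training=True,
    )
)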