diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 14ea3c43aea3..0abcb01a2124 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -2796,8 +2796,8 @@
     cluster_compute: 8gpus_96cpus.yaml

   run:
-    timeout: 600
-    script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --as-release-test
+    timeout: 1200
+    script: python learning_tests/tuned_examples/ppo/atari_ppo.py --enable-new-api-stack --env=ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 --stop-reward=20.0 --as-release-test

   alert: default
diff --git a/rllib/BUILD b/rllib/BUILD
index 20908f0d9060..6948f17e903c 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2379,7 +2379,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=auto"]
 )

 py_test(
@@ -2388,7 +2388,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples", "examples_use_all_core"],
     size = "large",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
+    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=auto", "--evaluation-duration-unit=episodes"]
 )

 py_test(
@@ -2397,7 +2397,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=511", "--evaluation-duration-unit=timesteps"]
 )

 py_test(
@@ -2406,7 +2406,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
+    args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=1001", "--evaluation-duration-unit=timesteps"]
 )

 py_test(
@@ -2415,7 +2415,7 @@ py_test(
     tags = ["team:rllib", "exclusive", "examples"],
     size = "medium",
     srcs = ["examples/evaluation/evaluation_parallel_to_training.py"],
-    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
+    args = ["--enable-new-api-stack", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=450.0", "--num-cpus=6", "--evaluation-duration=13", "--evaluation-duration-unit=episodes"]
 )

 py_test(
["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--evaluation-parallel-to-training", "--stop-reward=900.0", "--num-cpus=6", "--evaluation-duration=10", "--evaluation-duration-unit=episodes"] ) # @OldAPIStack @@ -2434,7 +2434,7 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] + args = ["--as-test", "--evaluation-parallel-to-training", "--stop-reward=50.0", "--num-cpus=6", "--evaluation-duration=auto"] ) # @OldAPIStack @@ -2444,7 +2444,7 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/evaluation/evaluation_parallel_to_training.py"], - args = ["--as-test", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] + args = ["--as-test", "--evaluation-parallel-to-training", "--framework=torch", "--stop-reward=30.0", "--num-cpus=6", "--evaluation-num-env-runners=3", "--evaluation-duration=211", "--evaluation-duration-unit=timesteps"] ) # subdirectory: gpus/ diff --git a/rllib/benchmarks/ppo/benchmark_atari_ppo.py b/rllib/benchmarks/ppo/benchmark_atari_ppo.py new file mode 100644 index 000000000000..0b697ff4b902 --- /dev/null +++ b/rllib/benchmarks/ppo/benchmark_atari_ppo.py @@ -0,0 +1,123 @@ +"""Script to execute RLlib's official PPO Atari benchmarks. + +How to run this script +---------------------- +`python [script-name].py --enable-new-api-stack --stop-timesteps 12000000 +--num-gpus=4 --num-env-runners=95` + +In order to only run individual or lists of envs, you can provide a list of env-strings +under the `--env` arg, such as `--env ALE/Pong-v5,ALE/Breakout-v5`. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +TODO (sven): Link to RLlib's to-be-created benchmark page. +""" +import subprocess + +from ray.rllib.utils.test_utils import add_rllib_example_script_args + + +parser = add_rllib_example_script_args() + +# Might need `gymnasium[atari, other]` to be installed. + +# See the following links for benchmark results of other libraries: +# Original paper: https://arxiv.org/abs/1812.05905 +# CleanRL: https://wandb.ai/cleanrl/cleanrl.benchmark/reports/Mujoco--VmlldzoxODE0NjE +# AgileRL: https://github.com/AgileRL/AgileRL?tab=readme-ov-file#benchmarks +# [0] = reward to expect for DQN rainbow [1] = timesteps to run (always 200M for DQN +# rainbow). +# Note that for PPO, we simply run everything for 6M ts. 
+benchmark_envs = {
+    "ALE/Alien-v5": (6022.9, 200000000),
+    "ALE/Amidar-v5": (202.8, 200000000),
+    "ALE/Assault-v5": (14491.7, 200000000),
+    "ALE/Asterix-v5": (280114.0, 200000000),
+    "ALE/Asteroids-v5": (2249.4, 200000000),
+    "ALE/Atlantis-v5": (814684.0, 200000000),
+    "ALE/BankHeist-v5": (826.0, 200000000),
+    "ALE/BattleZone-v5": (52040.0, 200000000),
+    "ALE/BeamRider-v5": (21768.5, 200000000),
+    "ALE/Berzerk-v5": (1793.4, 200000000),
+    "ALE/Bowling-v5": (39.4, 200000000),
+    "ALE/Boxing-v5": (54.9, 200000000),
+    "ALE/Breakout-v5": (379.5, 200000000),
+    "ALE/Centipede-v5": (7160.9, 200000000),
+    "ALE/ChopperCommand-v5": (10916.0, 200000000),
+    "ALE/CrazyClimber-v5": (143962.0, 200000000),
+    "ALE/Defender-v5": (47671.3, 200000000),
+    "ALE/DemonAttack-v5": (109670.7, 200000000),
+    "ALE/DoubleDunk-v5": (-0.6, 200000000),
+    "ALE/Enduro-v5": (2061.1, 200000000),
+    "ALE/FishingDerby-v5": (22.6, 200000000),
+    "ALE/Freeway-v5": (29.1, 200000000),
+    "ALE/Frostbite-v5": (4141.1, 200000000),
+    "ALE/Gopher-v5": (72595.7, 200000000),
+    "ALE/Gravitar-v5": (567.5, 200000000),
+    "ALE/Hero-v5": (50496.8, 200000000),
+    "ALE/IceHockey-v5": (-11685.8, 200000000),
+    "ALE/Kangaroo-v5": (10841.0, 200000000),
+    "ALE/Krull-v5": (6715.5, 200000000),
+    "ALE/KungFuMaster-v5": (28999.8, 200000000),
+    "ALE/MontezumaRevenge-v5": (154.0, 200000000),
+    "ALE/MsPacman-v5": (2570.2, 200000000),
+    "ALE/NameThisGame-v5": (11686.5, 200000000),
+    "ALE/Phoenix-v5": (103061.6, 200000000),
+    "ALE/Pitfall-v5": (-37.6, 200000000),
+    "ALE/Pong-v5": (19.0, 200000000),
+    "ALE/PrivateEye-v5": (1704.4, 200000000),
+    "ALE/Qbert-v5": (18397.6, 200000000),
+    "ALE/RoadRunner-v5": (54261.0, 200000000),
+    "ALE/Robotank-v5": (55.2, 200000000),
+    "ALE/Seaquest-v5": (19176.0, 200000000),
+    "ALE/Skiing-v5": (-11685.8, 200000000),
+    "ALE/Solaris-v5": (2860.7, 200000000),
+    "ALE/SpaceInvaders-v5": (12629.0, 200000000),
+    "ALE/StarGunner-v5": (123853.0, 200000000),
+    "ALE/Surround-v5": (7.0, 200000000),
+    "ALE/Tennis-v5": (-2.2, 200000000),
+    "ALE/TimePilot-v5": (11190.5, 200000000),
+    "ALE/Tutankham-v5": (126.9, 200000000),
+    "ALE/Venture-v5": (45.0, 200000000),
+    "ALE/VideoPinball-v5": (506817.2, 200000000),
+    "ALE/WizardOfWor-v5": (14631.5, 200000000),
+    "ALE/YarsRevenge-v5": (93007.9, 200000000),
+    "ALE/Zaxxon-v5": (19658.0, 200000000),
+}
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Compile the base command running the actual `tuned_example` script.
+    base_commands = [
+        "python",
+        "../../tuned_examples/ppo/atari_ppo.py",
+        "--enable-new-api-stack",
+        f"--num-env-runners={args.num_env_runners}" if args.num_env_runners else "",
+        f"--num-gpus={args.num_gpus}",
+        f"--wandb-key={args.wandb_key}" if args.wandb_key else "",
+        f"--wandb-project={args.wandb_project}" if args.wandb_project else "",
+        f"--wandb-run-name={args.wandb_run_name}" if args.wandb_run_name else "",
+        f"--stop-timesteps={args.stop_timesteps}",
+        f"--checkpoint-freq={args.checkpoint_freq}",
+        "--checkpoint-at-end" if args.checkpoint_at_end else "",
+    ]
+
+    # Loop through all envs (given on the command line via `--env` or found in
+    # `benchmark_envs`) and run the `tuned_example` script for each of them.
+    for env_name in args.env.split(",") if args.env else benchmark_envs.keys():
+        # Drop empty strings (options that were not provided on the command line).
+        commands = []
+        for c in base_commands:
+            if c != "":
+                commands.append(c)
+        commands.append(f"--env={env_name}")
+        commands.append(f"--wandb-run-name={env_name}")
+        print(f"Running {env_name} through command line=`{commands}`")
+        subprocess.run(commands)
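Side note on the command assembly above: options the user did not provide enter `base_commands` as empty strings and are filtered out per env before the subprocess call, and the env name is appended both as `--env` and as the WandB run name. A minimal, self-contained sketch of that behavior (the concrete values below are illustrative, not taken from this PR):

    # Illustrative only: mirrors the filtering/append logic of the benchmark runner.
    base_commands = [
        "python",
        "../../tuned_examples/ppo/atari_ppo.py",
        "--enable-new-api-stack",
        "",  # e.g. `--wandb-key=...` when no WandB key was given
        "--stop-timesteps=6000000",
    ]
    env_name = "ALE/Pong-v5"
    commands = [c for c in base_commands if c != ""]
    commands += [f"--env={env_name}", f"--wandb-run-name={env_name}"]
    print(commands)
    # -> ['python', '../../tuned_examples/ppo/atari_ppo.py', '--enable-new-api-stack',
    #     '--stop-timesteps=6000000', '--env=ALE/Pong-v5', '--wandb-run-name=ALE/Pong-v5']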
diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py
index 76aad3eccdf4..dce216147a32 100644
--- a/rllib/examples/evaluation/custom_evaluation.py
+++ b/rllib/examples/evaluation/custom_evaluation.py
@@ -85,7 +85,6 @@
 parser = add_rllib_example_script_args(
     default_iters=50, default_reward=0.7, default_timesteps=50000
 )
-parser.add_argument("--evaluation-parallel-to-training", action="store_true")
 parser.add_argument("--no-custom-eval", action="store_true")
 parser.add_argument("--corridor-length-training", type=int, default=10)
 parser.add_argument("--corridor-length-eval-worker-1", type=int, default=20)
diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py
index d0ab6b12e239..382cec33c77e 100644
--- a/rllib/examples/evaluation/evaluation_parallel_to_training.py
+++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -53,7 +53,7 @@
 |          71.7485 | 100000 |   476.51 |             476.51 |
 +------------------+--------+----------+--------------------+

-When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag),
+When running without parallel evaluation (no `--evaluation-parallel-to-training` flag),
 the experiment takes considerably longer (~70sec vs ~80sec):
 +-----------------------------+------------+-----------------+--------+
 | Trial name                  | status     | loc             |   iter |
@@ -89,37 +89,10 @@
 from ray.tune.registry import get_trainable_cls, register_env

 parser = add_rllib_example_script_args(default_reward=500.0)
-parser.add_argument(
-    "--evaluation-duration",
-    type=lambda v: v if v == "auto" else int(v),
-    default="auto",
-    help="Number of evaluation episodes/timesteps to run each iteration. "
-    "If 'auto', will run as many as possible during train pass.",
-)
-parser.add_argument(
-    "--evaluation-duration-unit",
-    type=str,
-    default="timesteps",
-    choices=["episodes", "timesteps"],
-    help="The unit in which to measure the duration (`episodes` or `timesteps`).",
-)
-parser.add_argument(
-    "--evaluation-not-parallel-to-training",
-    action="store_true",
-    help="Whether to NOT run evaluation parallel to training, but in sequence.",
-)
-parser.add_argument(
-    "--evaluation-num-env-runners",
-    type=int,
-    default=2,
-    help="The number of evaluation EnvRunners to setup. "
-    "0 for a single local evaluation EnvRunner.",
-)
-parser.add_argument(
-    "--evaluation-interval",
-    type=int,
-    default=1,
-    help="Every how many train iterations should we run an evaluation loop?",
+parser.set_defaults(
+    evaluation_num_env_runners=2,
+    evaluation_interval=1,
+    evaluation_duration_unit="timesteps",
 )
 parser.add_argument(
     "--evaluation-parallel-to-training-wo-thread",
@@ -219,9 +192,7 @@ def on_train_result(
         .evaluation(
             # Parallel evaluation+training config.
             # Switch on evaluation in parallel with training.
-            evaluation_parallel_to_training=(
-                not args.evaluation_not_parallel_to_training
-            ),
+            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
             # Use two evaluation workers. Must be >0, otherwise,
             # evaluation will run on a local worker and block (no parallelism).
             evaluation_num_env_runners=args.evaluation_num_env_runners,
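The per-script `add_argument` calls can be dropped here because the same evaluation options now live in the shared parser built by `add_rllib_example_script_args` (see the `rllib/utils/test_utils.py` changes further below); `parser.set_defaults(...)` then only overrides the library-wide defaults this example cares about. A minimal, stand-alone sketch of that argparse pattern, using two of the options added in this PR:

    import argparse

    # Shared parser (as in `add_rllib_example_script_args`): defines the options
    # once, with library-wide defaults.
    parser = argparse.ArgumentParser()
    parser.add_argument("--evaluation-interval", type=int, default=0)
    parser.add_argument(
        "--evaluation-duration-unit",
        default="episodes",
        choices=["episodes", "timesteps"],
    )

    # Example script: overrides only the defaults it needs, without redefining
    # (and thereby duplicating) the options themselves.
    parser.set_defaults(evaluation_interval=1, evaluation_duration_unit="timesteps")

    args = parser.parse_args([])
    assert args.evaluation_interval == 1
    assert args.evaluation_duration_unit == "timesteps"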
diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py
index ee76d8d3f9ce..1fc44dfc9a40 100644
--- a/rllib/tuned_examples/ppo/atari_ppo.py
+++ b/rllib/tuned_examples/ppo/atari_ppo.py
@@ -1,19 +1,18 @@
 import gymnasium as gym

+from ray import tune
 from ray.rllib.algorithms.ppo import PPOConfig
 from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
 from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
 from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
-from ray.rllib.utils.metrics import (
-    ENV_RUNNER_RESULTS,
-    EPISODE_RETURN_MEAN,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME,
-)
 from ray.rllib.utils.test_utils import add_rllib_example_script_args
-from ray import tune

-parser = add_rllib_example_script_args()
+parser = add_rllib_example_script_args(
+    default_reward=float("inf"),
+    default_timesteps=3000000,
+    default_iters=100000000000,
+)
 # Use `parser` to add your own custom command line options to this script
 # and (if needed) use their values toset up `config` below.
 args = parser.parse_args()
@@ -81,13 +80,8 @@ def _env_creator(cfg):
     )
 )

-stop = {
-    f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
-    NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000,
-}
-
 if __name__ == "__main__":
     from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

-    run_rllib_example_script_experiment(config, args=args, stop=stop)
+    run_rllib_example_script_experiment(config, args=args)
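With the hard-coded `stop` dict removed, the stopping behavior is now driven by the shared parser defaults set above (`default_timesteps=3000000`) and by CLI flags such as `--stop-reward=20.0` in the updated release test, which `run_rllib_example_script_experiment` turns into stop criteria. As a sketch (the flag-to-metric mapping happens inside that helper and is not shown in this diff), the removed dict corresponds roughly to:

    from ray.rllib.utils.metrics import (
        ENV_RUNNER_RESULTS,
        EPISODE_RETURN_MEAN,
        NUM_ENV_STEPS_SAMPLED_LIFETIME,
    )

    # Rough equivalent of the removed `stop` dict, now reachable via
    # `--stop-reward=20.0 --stop-timesteps=1500000` instead of being baked in.
    stop = {
        f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 20.0,
        NUM_ENV_STEPS_SAMPLED_LIFETIME: 1500000,
    }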
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index 6fe3f8069d43..38408356530e 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -135,6 +135,48 @@ def add_rllib_example_script_args(
         "experiment is then the sum over all individual agents' rewards.",
     )

+    # Evaluation options.
+    parser.add_argument(
+        "--evaluation-num-env-runners",
+        type=int,
+        default=0,
+        help="The number of evaluation (remote) EnvRunners to use for the experiment.",
+    )
+    parser.add_argument(
+        "--evaluation-interval",
+        type=int,
+        default=0,
+        help="Run one round of evaluation every this many training iterations. "
+        "Use 0 (default) to disable evaluation.",
+    )
+    parser.add_argument(
+        "--evaluation-duration",
+        type=lambda v: v if v == "auto" else int(v),
+        default=10,
+        help="The number of evaluation units to run in each evaluation round. "
+        "Use `--evaluation-duration-unit` to count either in 'episodes' "
+        "or 'timesteps'. If 'auto', runs as many units as possible during the "
+        "train pass (requires `--evaluation-parallel-to-training`).",
+    )
+    parser.add_argument(
+        "--evaluation-duration-unit",
+        type=str,
+        default="episodes",
+        choices=["episodes", "timesteps"],
+        help="The evaluation duration unit to count by: 'episodes' or "
+        "'timesteps'. `--evaluation-duration` of these units are run in each "
+        "evaluation round. If `--evaluation-duration=auto`, this setting does not "
+        "matter.",
+    )
+    parser.add_argument(
+        "--evaluation-parallel-to-training",
+        action="store_true",
+        help="Whether to run evaluation in parallel with training. This might speed up "
+        "your overall iteration time. Be aware that when using this option, the "
+        "reported evaluation results always refer to the iteration prior to the "
+        "current one.",
+    )
+
     # tune.Tuner options.
     parser.add_argument(
         "--no-tune",
@@ -1434,6 +1476,16 @@ def run_rllib_example_script_experiment(
         num_cpus_for_main_process=1,
     )

+    # Evaluation setup (only if requested via `--evaluation-interval` > 0).
+    if args.evaluation_interval > 0:
+        config.evaluation(
+            evaluation_num_env_runners=args.evaluation_num_env_runners,
+            evaluation_interval=args.evaluation_interval,
+            evaluation_duration=args.evaluation_duration,
+            evaluation_duration_unit=args.evaluation_duration_unit,
+            evaluation_parallel_to_training=args.evaluation_parallel_to_training,
+        )
+
     # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object).
     if args.no_tune:
         assert not args.as_test and not args.as_release_test
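Taken together, any example script that builds its parser with `add_rllib_example_script_args` and runs via `run_rllib_example_script_experiment` now gets evaluation wired up from the command line alone. A minimal sketch (the file name, env, and PPO config below are illustrative only, not part of this PR):

    # minimal_eval_example.py -- hypothetical example script.
    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.utils.test_utils import (
        add_rllib_example_script_args,
        run_rllib_example_script_experiment,
    )

    parser = add_rllib_example_script_args(default_reward=450.0)
    args = parser.parse_args()

    config = PPOConfig().environment("CartPole-v1")

    if __name__ == "__main__":
        # Passing `--evaluation-interval=1 --evaluation-duration=auto
        # --evaluation-parallel-to-training` on the command line is enough to turn on
        # parallel evaluation: `run_rllib_example_script_experiment` calls
        # `config.evaluation(...)` with these args whenever the interval is > 0.
        run_rllib_example_script_experiment(config, args=args)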