From 7728726080ed2b7a875608135ef4849c61406397 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 10 Apr 2024 22:42:19 +0200 Subject: [PATCH 01/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 3 +- rllib/examples/README.rst | 0 ...raining_step_on_and_off_policy_combined.py | 179 ++++++++++-------- .../checkpoint_by_custom_criteria.py | 141 ++++++++------ .../curriculum/curriculum_learning.py | 45 +---- 5 files changed, 188 insertions(+), 180 deletions(-) create mode 100644 rllib/examples/README.rst diff --git a/rllib/BUILD b/rllib/BUILD index be75db2de236..e33b9b621d7e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2112,14 +2112,13 @@ py_test( srcs = ["examples/checkpoints/cartpole_dqn_export.py"], ) -#@OldAPIStack py_test( name = "examples/checkpoints/checkpoint_by_custom_criteria", main = "examples/checkpoints/checkpoint_by_custom_criteria.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/checkpoints/checkpoint_by_custom_criteria.py"], - args = ["--stop-iters=3 --num-cpus=3"] + args = ["--enable-new-api-stack", "--stop-reward=150.0", "--num-cpus=9"] ) #@OldAPIStack diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 97ebf6a10d30..fd782f4828ff 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -1,31 +1,57 @@ -"""Example of using a custom training workflow. - -This example creates a number of CartPole agents, some of which are trained with -DQN, and some of which are trained with PPO. Both are executed concurrently -with a custom training workflow. +"""Example of using a custom training workflow via overriding `Algorithm.training_step`. + +This example: +- defines a new Algorithm subclass and implements its `setup` (to add a DQN replay +buffer) and `training_step` (to define a custom workflow using both on-policy and off- +policy learning). +- uses a multi-agent CartPole environment, in which N agents map to N RLModules, +some of which are trained using PPO (on-policy) and some are trained by DQN +(off-policy). +- training all RLModules (policies) is performed concurrently using the above mentioned +custom training workflow (defined in `training_step()`). + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-agents=2` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the PPO policy ("learnable_policy") does much +better than "random": + ++-------------------+------------+----------+------+----------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|-------------------+------------+----------+------+----------------+ +| PPO_multi_agen... | TERMINATED | 127. ... 
| 20 | 58.646 | ++-------------------+------------+----------+------+----------------+ + ++--------+-------------------+-----------------+--------------------+ +| ts | combined reward | reward random | reward | +| | | | learnable_policy | ++--------+-------------------+-----------------+--------------------| +| 80000 | 481.26 | 78.41 | 464.41 | ++--------+-------------------+-----------------+--------------------+ """ -import argparse -import os - -import ray -from ray import air, tune from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy -from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy -from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy -from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy -from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample -from ray.rllib.execution.train_ops import train_one_step -from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( - MultiAgentReplayBuffer, -) +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples +from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, @@ -33,45 +59,33 @@ NUM_TARGET_UPDATES, LAST_TARGET_UPDATE_TS, ) -from ray.rllib.utils.sgd import standardized -from ray.rllib.utils.test_utils import check_learning_achieved +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) from ray.rllib.utils.typing import ResultDict from ray.tune.registry import register_env -parser = argparse.ArgumentParser() -parser.add_argument("--torch", action="store_true") -parser.add_argument("--mixed-torch-tf", action="store_true") -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=600, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." -) -# 600.0 = 4 (num_agents) x 150.0 -parser.add_argument( - "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." + +parser = add_rllib_example_script_args( + default_iters=600, default_reward=600.0, default_timesteps=200000 ) # Define new Algorithm with custom `training_step()` method (training workflow). -class MyAlgo(Algorithm): +class MyOnPolicyOffPolicyAlgo(Algorithm): @override(Algorithm) def setup(self, config): # Call super's `setup` to create rollout workers. super().setup(config) - # Create local replay buffer. 
- self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) + # TODO (sven): Once we have a multi-agent capable replay buffer, use it here. + # For now, we are ok with a separate buffer per DQN agent (as we are doing + # independent learning anyways). + # Create N local replay buffers (one for each DQN agent). + self.local_replay_buffers = [ + EpisodeMultiAgentReplayBuffer(num_shards=1, capacity=50000) + for n in range(0, args.num_agents, 2) + ] @override(Algorithm) def training_step(self) -> ResultDict: @@ -152,19 +166,14 @@ def training_step(self) -> ResultDict: if __name__ == "__main__": args = parser.parse_args() - assert not ( - args.torch and args.mixed_torch_tf - ), "Use either --torch or --mixed-torch-tf, not both!" - - ray.init(local_mode=args.local_mode) # Simple environment with 4 independent cartpole entities register_env( - "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) + "env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg}) ) # Note that since the algorithm below does not include a default policy or - # policy configs, we have to explicitly set it in the multiagent config: + # policy configs, we have to explicitly set it in the multi_agent config: policies = { "ppo_policy": ( PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, @@ -184,36 +193,48 @@ def training_step(self) -> ResultDict: ), } - def policy_mapping_fn(agent_id, episode, worker, **kwargs): + def agent_to_module_mapping_fn(agent_id, episode, **kwargs): if agent_id % 2 == 0: - return "ppo_policy" + return "ppo_module" else: - return "dqn_policy" + return "dqn_module" - config = ( + base_config = ( AlgorithmConfig() - # TODO (Kourosh): Migrate this to the new RLModule / Learner API. - .experimental(_enable_new_api_stack=False) .environment("multi_agent_cartpole") - .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(num_rollout_workers=0, rollout_fragment_length=50) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + .rollouts(rollout_fragment_length=50) .reporting(metrics_num_episodes_for_smoothing=30) ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } + run_rllib_example_script_experiment(base_config, args) + + - results = tune.Tuner( - MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) - ).fit() - if args.as_test: - check_learning_achieved(results, args.stop_reward) + base_config = ( + PPOConfig() + .environment("multi_agent_cartpole") + .multi_agent( + policies={"learnable_policy", "random"}, + # Map to either random behavior or PPO learning behavior based on + # the agent's ID. + policy_mapping_fn=lambda agent_id, *args, **kwargs: [ + "learnable_policy", + "random", + ][agent_id % 2], + # We need to specify this here, b/c the `forward_train` method of + # `RandomRLModule` (ModuleID="random") throws a not-implemented error. 
+ policies_to_train=["learnable_policy"], + ) + .rl_module( + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "learnable_policy": SingleAgentRLModuleSpec(), + "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), + } + ), + ) + ) - ray.shutdown() + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 79d2405d88da..f5d915bbb03f 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -1,67 +1,92 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example extracting a checkpoint from n trials using one or more custom criteria. -import argparse -import os +This example: +- runs a simple CartPole experiment with three different learning rates (three tune +"trials"). During the experiment, for each trial, we create a checkpoint at each +iteration. +- at the end of the experiment, we compare the trials and pick the one that performed +best, based on the criterion: Lowest episode count per single iteration (for CartPole, +a low episode count means the episodes are very long and thus the reward is also very +high). +- from that best trial (with the lowest episode count), we then pick those checkpoints +that a) have the lowest policy loss (good) and b) have the highest value function loss +(bad). -import ray -from ray import air, tune -from ray.tune.registry import get_trainable_cls -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +In the console output, you can see the performance of the three different learning +rates used here: + ++-----------------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | lr | iter | +|-----------------------------+------------+-----------------+--------+--------+ +| PPO_CartPole-v1_d7dbe_00000 | TERMINATED | 127.0.0.1:98487 | 0.01 | 17 | +| PPO_CartPole-v1_d7dbe_00001 | TERMINATED | 127.0.0.1:98488 | 0.001 | 8 | +| PPO_CartPole-v1_d7dbe_00002 | TERMINATED | 127.0.0.1:98489 | 0.0001 | 9 | ++-----------------------------+------------+-----------------+--------+--------+ + ++------------------+-------+----------+----------------------+----------------------+ +| total time (s) | ts | reward | episode_reward_max | episode_reward_min | +|------------------+-------+----------+----------------------+----------------------+ +| 28.1068 | 39797 | 151.11 | 500 | 12 | +| 13.304 | 18728 | 158.91 | 500 | 15 | +| 14.8848 | 21069 | 167.36 | 500 | 13 | ++------------------+-------+----------+----------------------+----------------------+ + ++--------------------+ +| episode_len_mean | +|--------------------| +| 151.11 | +| 158.91 | +| 167.36 | ++--------------------+ +""" + +from ray import tune +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, ) -parser.add_argument("--stop-iters", type=int, default=200) -parser.add_argument("--stop-timesteps", type=int, default=100000) -parser.add_argument("--stop-reward", type=float, default=150.0) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_reward=450.0, default_timesteps=100000, default_iters=200 ) + if __name__ == "__main__": args = parser.parse_args() - ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) + # Force-set `args.checkpoint_freq` to 1. + args.checkpoint_freq = 1 # Simple generic config. - config = ( - get_trainable_cls(args.run) + base_config = ( + get_trainable_cls(args.algo) .get_default_config() .environment("CartPole-v1") - # Run with tracing enabled for tf2. - .framework(args.framework) # Run 3 trials. .training( lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341 - ) # TEST - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ) ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - # Run tune for some iterations and generate checkpoints. - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig( - stop=stop, checkpoint_config=air.CheckpointConfig(checkpoint_frequency=1) - ), - ) - results = tuner.fit() + results = run_rllib_example_script_experiment(base_config, args) # Get the best of the 3 trials by using some metric. 
# NOTE: Choosing the min `episodes_this_iter` automatically picks the trial @@ -79,8 +104,8 @@ best_result = results.get_best_result(metric=metric, mode="min", scope="all") value_best_metric = best_result.metrics_dataframe[metric].min() print( - "Best trial's lowest episode length (over all " - "iterations): {}".format(value_best_metric) + f"Best trial was the one with lr={best_result.metrics['config']['lr']}. " + "Reached lowest episode count ({value_best_metric}) in a single iteration." ) # Confirm, we picked the right trial. @@ -88,19 +113,23 @@ # Get the best checkpoints from the trial, based on different metrics. # Checkpoint with the lowest policy loss value: - if config._enable_new_api_stack: + if args.enable_new_api_stack: policy_loss_key = "info/learner/default_policy/policy_loss" else: policy_loss_key = "info/learner/default_policy/learner_stats/policy_loss" - ckpt = results.get_best_result(metric=policy_loss_key, mode="min").checkpoint - print("Lowest pol-loss: {}".format(ckpt)) + best_result = results.get_best_result(metric=policy_loss_key, mode="min") + ckpt = best_result.checkpoint + lowest_policy_loss = best_result.metrics_dataframe[policy_loss_key].min() + print(f"Checkpoint w/ lowest policy loss: {ckpt}") + print(f"Lowest policy loss: {lowest_policy_loss}") # Checkpoint with the highest value-function loss: - if config._enable_new_api_stack: + if args.enable_new_api_stack: vf_loss_key = "info/learner/default_policy/vf_loss" else: vf_loss_key = "info/learner/default_policy/learner_stats/vf_loss" - ckpt = results.get_best_result(metric=vf_loss_key, mode="max").checkpoint - print("Highest vf-loss: {}".format(ckpt)) - - ray.shutdown() + best_result = results.get_best_result(metric=vf_loss_key, mode="max") + ckpt = best_result.checkpoint + highest_value_fn_loss = best_result.metrics_dataframe[vf_loss_key].max() + print(f"Checkpoint w/ highest value function loss: {ckpt}") + print(f"Highest value function loss: {highest_value_fn_loss}") diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 5ce3b09ffd8d..f12be5ed2ea1 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -29,21 +29,6 @@ torch, nn = try_import_torch() parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) parser.add_argument( "--stop-iters", type=int, default=50, help="Number of iterations to train." ) @@ -56,11 +41,6 @@ default=10000.0, help="Reward at which we stop training.", ) -parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", -) def curriculum_fn( @@ -99,7 +79,6 @@ def curriculum_fn( if __name__ == "__main__": args = parser.parse_args() - ray.init(local_mode=args.local_mode) # Can also register the env creator function explicitly with: # register_env( @@ -113,29 +92,9 @@ def curriculum_fn( CurriculumCapableEnv, env_config={"start_level": 1}, # TODO (sven): Replace this API with a simple custom callback - # (override `on_train_results` in which we simply set each EnvRunners' - # env to the new stage). 
+ # (override `on_train_results` in which we set each EnvRunners' env to + # the new stage). env_task_fn=curriculum_fn, ) - .framework(args.framework) .rollouts(num_rollout_workers=2, num_envs_per_worker=5) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) - - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop, verbose=2), ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() From ac1ba1084a362c8e1a0d790f3c77e877c5c145f0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 16:16:47 +0200 Subject: [PATCH 02/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 3 +- rllib/algorithms/algorithm_config.py | 6 +- ...raining_step_on_and_off_policy_combined.py | 7 +- .../checkpoint_by_custom_criteria.py | 4 +- .../curriculum/curriculum_learning.py | 290 +++++++++++++----- .../examples/evaluation/custom_evaluation.py | 4 +- ...ock_paper_scissors_heuristic_vs_learned.py | 2 +- rllib/utils/test_utils.py | 14 +- 8 files changed, 230 insertions(+), 100 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 633e879d75a0..c821f418ebb4 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2251,14 +2251,13 @@ py_test( # subdirectory: curriculum/ # .................................... -#@OldAPIStack py_test( name = "examples/curriculum/curriculum_learning", main = "examples/curriculum/curriculum_learning.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/curriculum/curriculum_learning.py"], - args = ["--as-test", "--stop-reward=800.0"] + args = ["--enable-new-api-stack", "--as-test"] ) # subdirectory: debugging/ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index fa87ab0660c9..fa144251959b 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1468,11 +1468,7 @@ def environment( if env is not NotProvided: self.env = env if env_config is not NotProvided: - deep_update( - self.env_config, - env_config, - True, - ) + deep_update(self.env_config, env_config, True) if observation_space is not NotProvided: self.observation_space = observation_space if action_space is not NotProvided: diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index fd782f4828ff..c847bbc65a98 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -168,9 +168,7 @@ def training_step(self) -> ResultDict: args = parser.parse_args() # Simple environment with 4 independent cartpole entities - register_env( - "env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg}) - ) + register_env("env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg})) # Note that since the algorithm below does not include a default policy or # policy configs, we have to explicitly set it in the multi_agent config: @@ -209,9 +207,6 @@ def agent_to_module_mapping_fn(agent_id, episode, **kwargs): run_rllib_example_script_experiment(base_config, args) - - - base_config = ( PPOConfig() .environment("multi_agent_cartpole") diff --git 
a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index f5d915bbb03f..d48ec05965fe 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -81,9 +81,7 @@ .get_default_config() .environment("CartPole-v1") # Run 3 trials. - .training( - lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341 - ) + .training(lr=tune.grid_search([0.01, 0.001, 0.0001]), train_batch_size=2341) ) # Run tune for some iterations and generate checkpoints. results = run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index f12be5ed2ea1..48425bb6cede 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -1,100 +1,236 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of using an env-task curriculum via implementing a custom callback. + +This example: + - demonstrates how to define your own curriculum-capable environments using + gymnasium's FrozenLake env. + - defines a custom callback that gets called once per iteration and - if necessary - + changes the maps used by FrozenLake on all EnvRunners to a new task (by moving the + goal position further and further away from the starting position). + - also demonstrates an alternative approach via reloading/recreating an entirely new + env inside all EnvRunners. + - uses Tune and RLlib to curriculum-learn the env described above and compares 2 + algorithms, one that does use curriculum learning vs one that does not. + +We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step +limit of 16 to make it almost impossible for a non-curriculum policy to learn. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--no-curriculum` flag to disable curriculum learning and force your policy +to be trained on the hardest task right away. With this option, the algorithm should NOT +succeed. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` -""" -Example of a curriculum learning setup using the `TaskSettableEnv` API -and the env_task_fn config. -This example shows: - - Writing your own curriculum-capable environment using gym.Env. - - Defining a env_task_fn that determines, whether and which new task - the env(s) should be set to (using the TaskSettableEnv API). - - Using Tune and RLlib to curriculum-learn this env. +Results to expect +----------------- +In the console output, you can see that only PPO policy that uses a curriculum can +actually learn, whereas the one that is thrown into the toughest task right from the +start never learns anything. 
+ +Policy using the curriculum: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-------------------------------+------------+-----------------+--------+ +| PPO_FrozenLake-v1_93ca4_00000 | TERMINATED | 127.0.0.1:73318 | 41 | ++-------------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 97.652 | 164000 | 1 | 14.0348 | ++------------------+--------+----------+--------------------+ + +Policy NOT using the curriculum (trying to solve the hardest task right away): + -You can visualize experiment results in ~/ray_results using TensorBoard. """ -import argparse -import numpy as np -import os - -import ray -from ray import air, tune -from ray.rllib.env.apis.task_settable_env import TaskSettableEnv, TaskType -from ray.rllib.env.env_context import EnvContext -from ray.rllib.examples.envs.classes.curriculum_capable_env import CurriculumCapableEnv -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls +from functools import partial -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.callbacks import DefaultCallbacks +from ray.rllib.connectors.env_to_module import ( + AddObservationsFromEpisodesToBatch, + FlattenObservations, + WriteObservationsToEpisodes, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls -parser = argparse.ArgumentParser() +parser = add_rllib_example_script_args(default_iters=100, default_timesteps=600000) parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." + "--recreate-entire-gym-env", + action="store_true", + help="Whether to recreate the entire gym.vector.Env on each EnvRunner whenever a " + "new task/map should be loaded.", ) parser.add_argument( - "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." + "--upgrade-task-threshold", + type=float, + default=0.99, + help="The mean episode return, upon reaching of which we increase the task by one.", ) parser.add_argument( - "--stop-reward", - type=float, - default=10000.0, - help="Reward at which we stop training.", + "--no-curriculum", + action="store_true", + help="Whether to NOT use curriculum learning (and instead trying to solve the " + "hardest task right away).", ) -def curriculum_fn( - train_results: dict, task_settable_env: TaskSettableEnv, env_ctx: EnvContext -) -> TaskType: - """Function returning a possibly new task to set `task_settable_env` to. - - Args: - train_results: The train results returned by Algorithm.train(). - task_settable_env: A single TaskSettableEnv object - used inside any worker and at any vector position. Use `env_ctx` - to get the worker_index, vector_index, and num_workers. - env_ctx: The env context object (i.e. env's config dict - plus properties worker_index, vector_index and num_workers) used - to setup the `task_settable_env`. - - Returns: - TaskType: The task to set the env to. This may be the same as the - current one. - """ - # Our env supports tasks 1 (default) to 5. 
- # With each task, rewards get scaled up by a factor of 10, such that: - # Level 1: Expect rewards between 0.0 and 1.0. - # Level 2: Expect rewards between 1.0 and 10.0, etc.. - # We will thus raise the level/task each time we hit a new power of 10.0 - new_task = int(np.log10(train_results["episode_reward_mean"]) + 2.1) - # Clamp between valid values, just in case: - new_task = max(min(new_task, 5), 1) - print( - f"Worker #{env_ctx.worker_index} vec-idx={env_ctx.vector_index}" - f"\nR={train_results['episode_reward_mean']}" - f"\nSetting env to task={new_task}" - ) - return new_task +ENV_OPTIONS = { + "is_slippery": False, + # Limit the number of steps the agent is allowed to make in the env to + # make it almost impossible to learn without the curriculum. + "max_episode_steps": 16, +} + +# Our 3 tasks: 0=easiest, 1=medium, 2=hard +ENV_MAPS = [ + # 0 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFGFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 1 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFGFHF", + "FFFFFHHF", + "FHFFFFFF", + ], + # 2 + [ + "SFFHFFFH", + "FFFHFFFF", + "FFFFFFFF", + "FFFFFFFF", + "HFFFFFFF", + "HHFFFFHF", + "FFFFFHHF", + "FHFFFFFG", + ], +] + + +# Simple function sent to an EnvRunner to change the map of all its gym.Envs from +# the current one to a new (tougher) one, in which the goal position is further away +# from the starting position. Note that a map is a list of strings, each one +# representing one row in the map. Each character in the strings represent a single +# field (S=starting position, H=hole (bad), F=frozen/free field (ok), G=goal (great!)). +def _remote_fn(env_runner, new_task: int): + # We recreate the entire env object and re-assign it to the EnvRunner. + if True: # args.recreate_entire_gym_env: + # A SingleAgentEnvRunner holds a gym.vector.Env under `self.env`. + env_runner.config.environment(env_config={"desc": ENV_MAPS[new_task]}) + env_runner._make_env() + # Set the EnvRunners `_needs_initial_reset` property (to make sure we reset the + # next time we call `sample()` on the EnvRunner. + env_runner._needs_initial_reset = True + # We change each vectorized Env's map to the provided one. + else: + raise NotImplementedError + + +class CallbackThatSetsNewEnvTask(DefaultCallbacks): + """Custom callback implementing `on_train_result()` for changing the envs' maps.""" + + def on_train_result( + self, + *, + algorithm: Algorithm, + result: dict, + **kwargs, + ) -> None: + # Hack: Store the current task inside a counter in our Algorithm. + # W/o a curriculum, the task is always 2 (hardest). + if args.no_curriculum: + algorithm._counters["current_env_task"] = 2 + current_task = algorithm._counters["current_env_task"] + + # If episode return is consistently 1.0, we switch to a more difficult task + # (if possible). If we already mastered the most difficult task, we publish + # our victory in the result dict. + result["task_solved"] = 0.0 + current_return = result["sampler_results"]["episode_reward_mean"] + if current_return > args.upgrade_task_threshold: + if current_task < 2: + new_task = current_task + 1 + print( + f"Switching task/map on all EnvRunners to #{new_task} (0=easiest, " + f"2=hardest), b/c R={current_return} on current task." + ) + algorithm.workers.foreach_worker( + func=partial(_remote_fn, new_task=new_task) + ) + algorithm._counters["current_env_task"] = new_task + + # Hardest task was solved (1.0) -> report this in the results dict. 
+ elif current_return == 1.0: + result["task_solved"] = 1.0 if __name__ == "__main__": args = parser.parse_args() - # Can also register the env creator function explicitly with: - # register_env( - # "curriculum_env", lambda config: CurriculumCapableEnv(config)) - - config = ( - get_trainable_cls(args.run) + base_config = ( + get_trainable_cls(args.algo) .get_default_config() - # or "curriculum_env" if registered above + # Plug in our curriculum callbacks that controls when we should upgrade the env + # task based on the received return for the current task. + .callbacks(CallbackThatSetsNewEnvTask) .environment( - CurriculumCapableEnv, - env_config={"start_level": 1}, - # TODO (sven): Replace this API with a simple custom callback - # (override `on_train_results` in which we set each EnvRunners' env to - # the new stage). - env_task_fn=curriculum_fn, + "FrozenLake-v1", + env_config={ + # w/ curriculum: start with task=0 (easiest) + # w/o curriculum: start directly with hardest task 2. + "desc": ENV_MAPS[2 if args.no_curriculum else 0], + **ENV_OPTIONS, + }, + ) + .rollouts( + num_envs_per_worker=5, + env_to_module_connector=lambda env: [ + AddObservationsFromEpisodesToBatch(), + FlattenObservations(), + WriteObservationsToEpisodes(), + ], ) - .rollouts(num_rollout_workers=2, num_envs_per_worker=5) + ) + + stop = { + "training_iteration": args.stop_iters, + # Reward directly does not matter to us as we would like to continue + # after the 0-task (easiest) reaches a return of 1.0. But we DO want to + # stop, once the entire task is done (reaches return of 1.0 in the most + # difficult task=2). + "task_solved": 1.0, + "timesteps_total": args.stop_timesteps, + } + + run_rllib_example_script_experiment( + base_config, args, stop=stop, success_metric={"task_solved": 1.0} ) diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index b8e15c7739e0..7902ade2b520 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -140,5 +140,7 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul base_config, args, stop=stop, - success_metric="evaluation/sampler_results/episode_reward_mean", + success_metric={ + "evaluation/sampler_results/episode_reward_mean": args.stop_reward, + }, ) diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index 19a7a2edb529..49c23c02b584 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -143,5 +143,5 @@ base_config, args, stop=stop, - success_metric="sampler_results/policy_reward_mean/learned", + success_metric={"sampler_results/policy_reward_mean/learned": args.stop_reward}, ) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 216ecb77d752..ac55c29218f5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1199,7 +1199,7 @@ def run_rllib_example_script_experiment( args: argparse.Namespace, *, stop: Optional[Dict] = None, - success_metric: str = "sampler_results/episode_reward_mean", + success_metric: Optional[Dict] = None, ) -> Union[ResultDict, tune.result_grid.ResultGrid]: """Given an algorithm config and some command line args, runs an experiment. 
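# NOTE: The hunk above changes `success_metric` from a plain metric-key string to an
# optional dict of {result key: minimum value}. The lines below are a minimal calling
# sketch under that new signature — illustration only, not part of this patch. It
# assumes the `add_rllib_example_script_args` / `run_rllib_example_script_experiment`
# helpers defined in this file; the environment, stop values, and thresholds are
# placeholders.
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)

parser = add_rllib_example_script_args(default_reward=450.0, default_iters=200)
args = parser.parse_args()

base_config = PPOConfig().environment("CartPole-v1")

run_rllib_example_script_experiment(
    base_config,
    args,
    # Optional custom stopping criteria (if omitted, they are derived from `args`).
    stop={
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
    },
    # New convention: a single-entry dict mapping a ResultDict key to the minimum
    # value that must be reached for an `--as-test` run to count as successful
    # (previously this argument was just the metric-key string).
    success_metric={"sampler_results/episode_reward_mean": args.stop_reward},
)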
@@ -1261,7 +1261,7 @@ def run_rllib_example_script_experiment( if args.no_tune: algo = config.build() - for iter in range(args.stop_iters): + for _ in range(args.stop_iters): results = algo.train() print(f"R={results['sampler_results']['episode_reward_mean']}", end="") if "evaluation" in results: @@ -1334,10 +1334,14 @@ def run_rllib_example_script_experiment( ).fit() if args.as_test: + if success_metric is None: + success_metric = {"sampler_results/episode_reward_mean": args.stop_reward} + # TODO (sven): Make this work for more than one metric (AND-logic?). + metric = next(iter(success_metric.keys())) check_learning_achieved( - results, - args.stop_reward, - metric=success_metric, + tune_results=results, + min_value=success_metric[metric], + metric=metric, ) return results From 44e436bf0323c4fac1f4f92ce63e14bb7700c174 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 16:18:52 +0200 Subject: [PATCH 03/11] wip Signed-off-by: sven1977 --- rllib/examples/README.rst | 0 ...raining_step_on_and_off_policy_combined.py | 180 ++++++++---------- 2 files changed, 82 insertions(+), 98 deletions(-) delete mode 100644 rllib/examples/README.rst diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index c847bbc65a98..97ebf6a10d30 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -1,57 +1,31 @@ -"""Example of using a custom training workflow via overriding `Algorithm.training_step`. - -This example: -- defines a new Algorithm subclass and implements its `setup` (to add a DQN replay -buffer) and `training_step` (to define a custom workflow using both on-policy and off- -policy learning). -- uses a multi-agent CartPole environment, in which N agents map to N RLModules, -some of which are trained using PPO (on-policy) and some are trained by DQN -(off-policy). -- training all RLModules (policies) is performed concurrently using the above mentioned -custom training workflow (defined in `training_step()`). - - -How to run this script ----------------------- -`python [script file name].py --enable-new-api-stack --num-agents=2` - -For debugging, use the following additional command line options -`--no-tune --num-env-runners=0` -which should allow you to set breakpoints anywhere in the RLlib code and -have the execution stop there for inspection and debugging. - -For logging to your WandB account, use: -`--wandb-key=[your WandB API key] --wandb-project=[some project name] ---wandb-run-name=[optional: WandB run name (within the defined project)]` - - -Results to expect ------------------ -In the console output, you can see the PPO policy ("learnable_policy") does much -better than "random": - -+-------------------+------------+----------+------+----------------+ -| Trial name | status | loc | iter | total time (s) | -| | | | | | -|-------------------+------------+----------+------+----------------+ -| PPO_multi_agen... | TERMINATED | 127. ... 
| 20 | 58.646 | -+-------------------+------------+----------+------+----------------+ - -+--------+-------------------+-----------------+--------------------+ -| ts | combined reward | reward random | reward | -| | | | learnable_policy | -+--------+-------------------+-----------------+--------------------| -| 80000 | 481.26 | 78.41 | 464.41 | -+--------+-------------------+-----------------+--------------------+ +"""Example of using a custom training workflow. + +This example creates a number of CartPole agents, some of which are trained with +DQN, and some of which are trained with PPO. Both are executed concurrently +with a custom training workflow. """ +import argparse +import os + +import ray +from ray import air, tune from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec +from ray.rllib.algorithms.dqn.dqn import DQNConfig +from ray.rllib.algorithms.dqn.dqn_tf_policy import DQNTFPolicy +from ray.rllib.algorithms.dqn.dqn_torch_policy import DQNTorchPolicy +from ray.rllib.algorithms.ppo.ppo import PPOConfig +from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy +from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.execution.rollout_ops import synchronous_parallel_sample +from ray.rllib.execution.train_ops import train_one_step +from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ( + MultiAgentReplayBuffer, +) from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule +from ray.rllib.policy.sample_batch import MultiAgentBatch, concat_samples from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, @@ -59,33 +33,45 @@ NUM_TARGET_UPDATES, LAST_TARGET_UPDATE_TS, ) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) +from ray.rllib.utils.sgd import standardized +from ray.rllib.utils.test_utils import check_learning_achieved from ray.rllib.utils.typing import ResultDict from ray.tune.registry import register_env - -parser = add_rllib_example_script_args( - default_iters=600, default_reward=600.0, default_timesteps=200000 +parser = argparse.ArgumentParser() +parser.add_argument("--torch", action="store_true") +parser.add_argument("--mixed-torch-tf", action="store_true") +parser.add_argument( + "--local-mode", + action="store_true", + help="Init Ray in local mode for easier debugging.", +) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=600, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=200000, help="Number of timesteps to train." +) +# 600.0 = 4 (num_agents) x 150.0 +parser.add_argument( + "--stop-reward", type=float, default=600.0, help="Reward at which we stop training." ) # Define new Algorithm with custom `training_step()` method (training workflow). 
-class MyOnPolicyOffPolicyAlgo(Algorithm): +class MyAlgo(Algorithm): @override(Algorithm) def setup(self, config): # Call super's `setup` to create rollout workers. super().setup(config) - # TODO (sven): Once we have a multi-agent capable replay buffer, use it here. - # For now, we are ok with a separate buffer per DQN agent (as we are doing - # independent learning anyways). - # Create N local replay buffers (one for each DQN agent). - self.local_replay_buffers = [ - EpisodeMultiAgentReplayBuffer(num_shards=1, capacity=50000) - for n in range(0, args.num_agents, 2) - ] + # Create local replay buffer. + self.local_replay_buffer = MultiAgentReplayBuffer(num_shards=1, capacity=50000) @override(Algorithm) def training_step(self) -> ResultDict: @@ -166,12 +152,19 @@ def training_step(self) -> ResultDict: if __name__ == "__main__": args = parser.parse_args() + assert not ( + args.torch and args.mixed_torch_tf + ), "Use either --torch or --mixed-torch-tf, not both!" + + ray.init(local_mode=args.local_mode) # Simple environment with 4 independent cartpole entities - register_env("env", lambda cfg: MultiAgentCartPole({"num_agents": 4, **cfg})) + register_env( + "multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 4}) + ) # Note that since the algorithm below does not include a default policy or - # policy configs, we have to explicitly set it in the multi_agent config: + # policy configs, we have to explicitly set it in the multiagent config: policies = { "ppo_policy": ( PPOTorchPolicy if args.torch or args.mixed_torch_tf else PPOTF1Policy, @@ -191,45 +184,36 @@ def training_step(self) -> ResultDict: ), } - def agent_to_module_mapping_fn(agent_id, episode, **kwargs): + def policy_mapping_fn(agent_id, episode, worker, **kwargs): if agent_id % 2 == 0: - return "ppo_module" + return "ppo_policy" else: - return "dqn_module" + return "dqn_policy" - base_config = ( + config = ( AlgorithmConfig() + # TODO (Kourosh): Migrate this to the new RLModule / Learner API. + .experimental(_enable_new_api_stack=False) .environment("multi_agent_cartpole") + .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(rollout_fragment_length=50) + .rollouts(num_rollout_workers=0, rollout_fragment_length=50) + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .reporting(metrics_num_episodes_for_smoothing=30) ) - run_rllib_example_script_experiment(base_config, args) + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } - base_config = ( - PPOConfig() - .environment("multi_agent_cartpole") - .multi_agent( - policies={"learnable_policy", "random"}, - # Map to either random behavior or PPO learning behavior based on - # the agent's ID. - policy_mapping_fn=lambda agent_id, *args, **kwargs: [ - "learnable_policy", - "random", - ][agent_id % 2], - # We need to specify this here, b/c the `forward_train` method of - # `RandomRLModule` (ModuleID="random") throws a not-implemented error. 
- policies_to_train=["learnable_policy"], - ) - .rl_module( - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "learnable_policy": SingleAgentRLModuleSpec(), - "random": SingleAgentRLModuleSpec(module_class=RandomRLModule), - } - ), - ) - ) + results = tune.Tuner( + MyAlgo, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) - run_rllib_example_script_experiment(base_config, args) + ray.shutdown() From 2cffca9672f4f84dd1e573d91e474f295144fba9 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 12 Apr 2024 17:24:23 +0200 Subject: [PATCH 04/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 15 +- .../curriculum/curriculum_learning.py | 3 +- rllib/examples/envs/custom_gym_env.py | 231 +++++++++--------- rllib/utils/test_utils.py | 63 ++++- 4 files changed, 168 insertions(+), 144 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index c821f418ebb4..ba1580c21b9b 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2276,24 +2276,13 @@ py_test( # subdirectory: envs/ # .................................... -#@OldAPIStack -py_test( - name = "examples/envs/custom_gym_env_tf", - main = "examples/envs/custom_gym_env.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/envs/custom_gym_env.py"], - args = ["--as-test", "--framework=tf"] -) - -#@OldAPIStack py_test( - name = "examples/envs/custom_gym_env_torch", + name = "examples/envs/custom_gym_env", main = "examples/envs/custom_gym_env.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/envs/custom_gym_env.py"], - args = ["--as-test", "--framework=torch"] + args = ["--enable-new-api-stack", "--as-test"] ) #@OldAPIStack diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 48425bb6cede..921161bf194d 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -52,8 +52,7 @@ +------------------+--------+----------+--------------------+ Policy NOT using the curriculum (trying to solve the hardest task right away): - - +[DOES NOT LEARN AT ALL] """ from functools import partial diff --git a/rllib/examples/envs/custom_gym_env.py b/rllib/examples/envs/custom_gym_env.py index 95cdbcee81d5..616e3cee15b9 100644 --- a/rllib/examples/envs/custom_gym_env.py +++ b/rllib/examples/envs/custom_gym_env.py @@ -1,167 +1,154 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of defining a custom gymnasium Env to be learned by an RLlib Algorithm. -""" -Example of a custom gym environment. Run this example for a demo. +This example: + - demonstrates how to write your own (single-agent) gymnasium Env class, define its + physics and mechanics, the reward function used, the allowed actions (action space), + and the type of observations (observation space), etc.. + - shows how to configure and setup this environment class within an RLlib + Algorithm config. + - runs the experiment with the configured algo, trying to solve the environment. + +To see more details on which env we are building for this example, take a look at the +`SimpleCorridor` class defined below. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--corridor-length` option to set a custom length for the corridor. Note that +for extremely long corridors, the algorithm should take longer to learn. 
-This example shows the usage of: - - a custom environment - - Ray Tune for grid search to try different learning rates +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. -You can visualize experiment results in ~/ray_results using TensorBoard. +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` -Run example with defaults: -$ python custom_env.py -For CLI options: -$ python custom_env.py --help + +Results to expect +----------------- +You should see results similar to the following in your console output: + ++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_78714_00000 | TERMINATED | 127.0.0.1:85794 | 7 | ++--------------------------------+------------+-----------------+--------+ + ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 18.3034 | 28000 | 0.908918 | 12.9676 | ++------------------+-------+----------+--------------------+ """ -import argparse import gymnasium as gym from gymnasium.spaces import Discrete, Box import numpy as np -import os import random -import ray -from ray import air, tune -from ray.rllib.env.env_context import EnvContext -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.logger import pretty_print -from ray.tune.registry import get_trainable_cls - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() +from typing import Optional -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, ) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=0.1, help="Reward at which we stop training." -) -parser.add_argument( - "--no-tune", - action="store_true", - help="Run without Tune using a manual train loop instead. In this case," - "use PPO without grid search and no TensorBoard.", +from ray.tune.registry import get_trainable_cls, register_env # noqa + + +parser = add_rllib_example_script_args( + default_reward=0.9, default_iters=50, default_timesteps=100000 ) parser.add_argument( - "--local-mode", - action="store_true", - help="Init Ray in local mode for easier debugging.", + "--corridor-length", + type=int, + default=10, + help="The length of the corridor in fields. 
Note that this number includes the " + "starting- and goal states.", ) class SimpleCorridor(gym.Env): - """Example of a custom env in which you have to walk down a corridor. - - You can configure the length of the corridor via the env config.""" - - def __init__(self, config: EnvContext): - self.end_pos = config["corridor_length"] + """Example of a custom env in which the agent has to walk down a corridor. + + ------------ + |S........G| + ------------ + , where S is the starting position, G is the goal position, and fields with '.' + mark free spaces, over which the agent may step. The length of the above example + corridor is 10. + Allowed actions are left (0) and right (1). + The reward function is -0.01 per step taken and a uniform random value between + 0.5 and 1.5 when reaching the goal state. + + You can configure the length of the corridor via the env's config. Thus, in your + AlgorithmConfig, you can do: + `config.environment(env_config={"corridor_length": ..})`. + """ + + def __init__(self, config: Optional[dict] = None): + config = config or {} + self.end_pos = config.get("corridor_length", 7) self.cur_pos = 0 self.action_space = Discrete(2) self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32) - # Set the seed. This is only used for the final (reach goal) reward. - self.reset(seed=config.worker_index * config.num_workers) def reset(self, *, seed=None, options=None): random.seed(seed) self.cur_pos = 0 - return [self.cur_pos], {} + # Return obs and (empty) info dict. + return np.array([self.cur_pos], np.float32), {} def step(self, action): assert action in [0, 1], action + # Move left. if action == 0 and self.cur_pos > 0: self.cur_pos -= 1 + # Move right. elif action == 1: self.cur_pos += 1 - done = truncated = self.cur_pos >= self.end_pos - # Produce a random reward when we reach the goal. + + # The environment only ever terminates when we reach the goal state. + terminated = self.cur_pos >= self.end_pos + truncated = False + # Produce a random reward from [0.5, 1.5] when we reach the goal. + reward = random.uniform(0.5, 1.5) if terminated else -0.01 + infos = {} return ( - [self.cur_pos], - random.random() * 2 if done else -0.1, - done, + np.array([self.cur_pos], np.float32), + reward, + terminated, truncated, - {}, + infos, ) if __name__ == "__main__": args = parser.parse_args() - print(f"Running with following CLI options: {args}") - - ray.init(local_mode=args.local_mode) # Can also register the env creator function explicitly with: - # register_env("corridor", lambda config: SimpleCorridor(config)) + # register_env("corridor-env", lambda config: SimpleCorridor()) - config = ( - get_trainable_cls(args.run) - .get_default_config() - # or "corridor" if registered above - .environment(SimpleCorridor, env_config={"corridor_length": 5}) - .framework(args.framework) - .rollouts(num_rollout_workers=1) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - ) + # Or you can hard code certain settings into the Env's constructor (`config`). 
+ # register_env( + # "corridor-env-w-len-100", + # lambda config: SimpleCorridor({**config, **{"corridor_length": 100}}), + # ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } - - if args.no_tune: - # manual training with train loop using PPO and fixed learning rate - if args.run != "PPO": - raise ValueError("Only support --run PPO with --no-tune.") - print("Running manual train loop without Ray Tune.") - # use fixed learning rate instead of grid search (needs tune) - config.lr = 1e-3 - algo = config.build() - # run manual training loop and print results after each iteration - for _ in range(args.stop_iters): - result = algo.train() - print(pretty_print(result)) - # stop training of the target train steps or reward are reached - if ( - result["timesteps_total"] >= args.stop_timesteps - or result["episode_reward_mean"] >= args.stop_reward - ): - break - algo.stop() - else: - # automated run with Tune and grid search and TensorBoard - print("Training automatically with Ray Tune") - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), - run_config=air.RunConfig(stop=stop), - ) - results = tuner.fit() + # Or allow the RLlib user to set more c'tor options via their algo config: + # config.environment(env_config={[c'tor arg name]: [value]}) + # register_env("corridor-env", lambda config: SimpleCorridor(config)) - if args.as_test: - print("Checking if learning goals were achieved") - check_learning_achieved(results, args.stop_reward) + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + SimpleCorridor, # or provide the registered string: "corridor-env" + env_config={"corridor_length": args.corridor_length}, + ) + ) - ray.shutdown() + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index ac55c29218f5..ecd9da2f896e 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1194,6 +1194,11 @@ def should_check_eval(experiment): return result +# TODO (sven): Make this the de-facto, well documented, and unified utility for most of +# our tests: +# - CI (label: "learning_tests") +# - release tests (benchmarks) +# - example scripts def run_rllib_example_script_experiment( base_config: "AlgorithmConfig", args: argparse.Namespace, @@ -1204,24 +1209,57 @@ def run_rllib_example_script_experiment( """Given an algorithm config and some command line args, runs an experiment. There are some constraints on what properties must be defined in `args`. - It should ideally be generated via the `` + It should ideally be generated via calling + `args = add_rllib_example_script_args()`, which can be found in this very module + here. + + The function sets up an Algorithm object from the given config (altered by the + contents of `args`), then runs the Algorithm via Tune (or manually, if + `args.no_tune` is set to True) using the stopping criteria in `stop`. + + At the end of the experiment, if `args.as_test` is True, checks, whether the + Algorithm reached the `success_metric` (if None, use + `sampler_results/episode_reward_mean` with a minimum value of `args.stop_reward`). + + See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview + of all supported command line options. Args: base_config: The AlgorithmConfig object to use for this experiment. This base config will be automatically "extended" based on some of the provided `args`. 
For example, `args.num_env_runners` is used to set `config.num_rollout_workers`, etc.. - args: A argparse.Namespace object which must have the following properties - defined: `stop_iters`, `stop_reward`, `stop_timesteps`, `no_tune`, - `verbose`, `checkpoint_freq`, `as_test`. Optionally, for wandb logging: - `wandb_key`, `wandb_project`, `wandb_run_name`. + args: A argparse.Namespace object, ideally returned by calling + `args = add_rllib_example_script_args()`. It must have the following + properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`, + `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB + logging: `wandb_key`, `wandb_project`, `wandb_run_name`. + stop: An optional dict mapping ResultDict key strings (using "/" in case of + nesting, e.g. "sampler_results/episode_reward_mean" for referring to + `result_dict['sampler_results']['episode_reward_mean']` to minimum + values, reaching of which will stop the experiment). Default is: + { + "sampler_results/episode_reward_mean": args.stop_reward, + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + } + success_metric: Only relevent if `args.as_test` is True. + A dict mapping a single(!) ResultDict key string (using "/" in + case of nesting, e.g. "sampler_results/episode_reward_mean" for referring + to `result_dict['sampler_results']['episode_reward_mean']` to minimum + values, reaching of which will stop the experiment) to a single(!) minimum + value to be reached in order for the experiment to count as successful. + If `args.as_test` is True AND this `success_metric` is not reached with the + bounds defined by `stop`, will raise an Exception. Returns: The last ResultDict from a --no-tune run OR the tune.Tuner.fit() results. """ + # Initialize Ray. ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) + # Define one or more stopping criteria. stop = stop or { "training_iteration": args.stop_iters, "sampler_results/episode_reward_mean": args.stop_reward, @@ -1231,10 +1269,13 @@ def run_rllib_example_script_experiment( from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner - # Extend the `base_config` based on provided `args`. + # Enhance the `base_config`, based on provided `args`. config = ( + # Set the framework. base_config.framework(args.framework) + # Enable the new API stack? .experimental(_enable_new_api_stack=args.enable_new_api_stack) + # Define EnvRunner/RolloutWorker scaling and behavior. .rollouts( num_rollout_workers=args.num_env_runners, # Set up the correct env-runner to use depending on @@ -1249,6 +1290,7 @@ def run_rllib_example_script_experiment( ) ), ) + # Define compute resources used. .resources( # Old stack. num_gpus=0 if args.enable_new_api_stack else args.num_gpus, @@ -1259,6 +1301,7 @@ def run_rllib_example_script_experiment( ) ) + # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). if args.no_tune: algo = config.build() for _ in range(args.stop_iters): @@ -1281,6 +1324,9 @@ def run_rllib_example_script_experiment( return results return results + # Run the experiment using Ray Tune. + + # Log results using WandB. callbacks = None if hasattr(args, "wandb_key") and args.wandb_key is not None: project = args.wandb_project or ( @@ -1295,8 +1341,9 @@ def run_rllib_example_script_experiment( ) ] - progress_reporter = None + # Auto-configure a CLIReporter (to log the results to the console). 
# Use better ProgressReporter for multi-agent cases: List individual policy rewards. + progress_reporter = None if args.num_agents > 0: progress_reporter = CLIReporter( metric_columns={ @@ -1317,6 +1364,7 @@ def run_rllib_example_script_experiment( # `CLIReporter`. os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + # Run the actual experiment (using Tune). results = tune.Tuner( config.algo_class, param_space=config, @@ -1333,6 +1381,7 @@ def run_rllib_example_script_experiment( tune_config=tune.TuneConfig(num_samples=args.num_samples), ).fit() + # If run as a test, check whether we reached the specified success criteria. if args.as_test: if success_metric is None: success_metric = {"sampler_results/episode_reward_mean": args.stop_reward} From d741906f636bdb3a60d25b804d4ac28ec97208c6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 13 Apr 2024 19:01:27 +0200 Subject: [PATCH 05/11] wip Signed-off-by: sven1977 --- rllib/examples/ray_tune/custom_experiment.py | 25 +++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 9c42937dc0c6..1f326d3c002a 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -1,6 +1,18 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example of a custom Ray Tune experiment wrapping an RLlib Algorithm. + +You should only use such a customized workflow if the following conditions apply: +- You know exactly what you are doing :) +- Simply configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig +is not enough and doesn't allow you to shape the Algorithm into behaving the way you'd +like. +-- Note that for complex and custom evaluation procedures there is a RLlib Algorithm +config option (see examples/evaluation/custom_evaluation.py for more details). +- Subclassing + +""" + +TODO: Continue docstring above -"""Example of a custom experiment wrapped around an RLlib Algorithm.""" import argparse import ray @@ -48,11 +60,12 @@ def experiment(config): args = parser.parse_args() ray.init(num_cpus=3) - config = ppo.PPOConfig().environment("CartPole-v1") - config = config.to_dict() - config["train-iterations"] = args.train_iterations + base_config = ( + ) + base_config = base_config.to_dict() + base_config["train-iterations"] = args.train_iterations tune.Tuner( - tune.with_resources(experiment, ppo.PPO.default_resource_request(config)), + tune.with_resources(experiment, ppo.PPO.default_resource_request(base_config)), param_space=config, ).fit() From 403c3afa521232aafff7de23c73c054b3da93095 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 09:48:59 +0200 Subject: [PATCH 06/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 12 +- rllib/algorithms/algorithm_config.py | 11 ++ rllib/examples/ray_tune/custom_experiment.py | 93 ++++++++---- rllib/examples/ray_tune/custom_logger.py | 142 +++++++++--------- .../ray_tune/custom_progress_reporter.py | 115 ++++++++++++++ .../ray_tune/custom_train_function.py | 69 --------- rllib/utils/test_utils.py | 20 ++- 7 files changed, 276 insertions(+), 186 deletions(-) create mode 100644 rllib/examples/ray_tune/custom_progress_reporter.py delete mode 100644 rllib/examples/ray_tune/custom_train_function.py diff --git a/rllib/BUILD b/rllib/BUILD index ba1580c21b9b..82d844445c3d 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2834,7 +2834,6 @@ py_test( # subdirectory: ray_tune/ # .................................... 
-#@OldAPIStack py_test( name = "examples/ray_tune/custom_experiment", main = "examples/ray_tune/custom_experiment.py", @@ -2844,23 +2843,20 @@ py_test( args = ["--train-iterations=10"] ) -#@OldAPIStack py_test( name = "examples/ray_tune/custom_logger", main = "examples/ray_tune/custom_logger.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/ray_tune/custom_logger.py"], - args = ["--stop-iters=3"] ) -#@OldAPIStack py_test( - name = "examples/ray_tune/custom_train_function", - main = "examples/ray_tune/custom_train_function.py", + name = "examples/ray_tune/custom_progres_reporter", + main = "examples/ray_tune/custom_progres_reporter.py", tags = ["team:rllib", "exclusive", "examples"], - size = "medium", - srcs = ["examples/ray_tune/custom_train_function.py"], + size = "small", + srcs = ["examples/ray_tune/custom_progres_reporter.py"], ) # subdirectory: rl_modules/ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index fa144251959b..17d29c9effbd 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -4017,6 +4017,17 @@ def _validate_new_api_stack_settings(self): # TODO (sven): Once everything is on the new API stack, we won't need this method # anymore. def _validate_to_be_deprecated_settings(self): + # Env task fn will be deprecated. + if self._enable_new_api_stack and self.env_task_fn is not None: + deprecation_warning( + old="AlgorithmConfig.env_task_fn", + help="The `env_task_fn` API is not supported on the new API stack! " + "Curriculum learning should instead be implemented solely via " + "custom callbacks. Check out our curriculum learning example " + "script for more information: " + "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py" # noqa + ) + if self.preprocessor_pref not in ["rllib", "deepmind", None]: raise ValueError( "`config.preprocessor_pref` must be either 'rllib', 'deepmind' or None!" diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 1f326d3c002a..2f371e87ce54 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -11,32 +11,44 @@ """ -TODO: Continue docstring above +"""Example of a custom training workflow. Run this for a demo. -import argparse +This example shows: + - using Tune trainable functions to implement custom training workflows -import ray +You can visualize experiment results in ~/ray_results using TensorBoard. +""" from ray import train, tune -import ray.rllib.algorithms.ppo as ppo - -parser = argparse.ArgumentParser() -parser.add_argument("--train-iterations", type=int, default=10) - - -def experiment(config): - iterations = config.pop("train-iterations") - - algo = ppo.PPO(config=config) - checkpoint = None - train_results = {} - - # Train - for i in range(iterations): - train_results = algo.train() - if i % 2 == 0 or i == iterations - 1: - checkpoint = algo.save(train.get_context().get_trial_dir()) - train.report(train_results) - algo.stop() +from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner + + +def my_experiment(config): + iterations = config.pop("train-iterations", 10) + + config = PPOConfig().update_from_dict(config).environment("CartPole-v1") + + # Train for n iterations with high LR. 
+ config.lr = 0.01 + agent1 = config.build() + for _ in range(iterations): + result = agent1.train() + result["phase"] = 1 + train.report(result) + phase1_time = result["timesteps_total"] + state = agent1.save() + agent1.stop() + + # Train for n iterations with low LR + config.lr = 0.0001 + agent2 = config.build() + agent2.restore(state) + for _ in range(iterations): + result = agent2.train() + result["phase"] = 2 + result["timesteps_total"] += phase1_time # keep time moving forward + train.report(result) + agent2.stop() # Manual Eval config["num_workers"] = 0 @@ -57,15 +69,34 @@ def experiment(config): if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=3) base_config = ( + PPOConfig() + .experimental(_enable_new_api_stack=True) + .environment("CartPole-v1") + .rollouts( + num_rollout_workers=0, + env_runner_cls=SingleAgentEnvRunner, + ) + ) + # Convert to a plain dict for Tune. Note that this is usually not needed, you can + # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object. + # However, for demonstration purposes, we show here how you can add other, arbitrary + # keys to the plain config dict and then pass these keys to your custom experiment + # function. + config_dict = base_config.to_dict() + + # Set a Special flag signalling `my_train_fn` how many iters to do. + config_dict["train-iterations"] = 2 + + training_function = tune.with_resources( + my_experiment, + resources=base_config.algo_class.default_resource_request(base_config), ) - base_config = base_config.to_dict() - base_config["train-iterations"] = args.train_iterations - tune.Tuner( - tune.with_resources(experiment, ppo.PPO.default_resource_request(base_config)), - param_space=config, - ).fit() + tuner = tune.Tuner( + training_function, + # Pass in your config dict. + param_space=config_dict, + ) + tuner.fit() diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index e45f57b18645..89b59222a31f 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -1,50 +1,59 @@ -# TODO (sven): Move this example script into the new API stack. +"""Example showing how to define a custom Logger class for an RLlib Algorithm. -""" -This example script demonstrates how one can define a custom logger -object for any RLlib Algorithm via the Algorithm's config's `logger_config` property. -By default (logger_config=None), RLlib will construct a tune -UnifiedLogger object, which logs JSON, CSV, and TBX output. +The script uses the AlgorithmConfig's `debugging` API to setup the custom Logger: + +``` +config.debugging(logger_config={ + "type": [some Logger subclass], + "ctor_arg1", ..., + "ctor_arg2", ..., +}) +``` + +All keys other than "type" in the logger_config dict will be passed into the Logger +class's constructor. +By default (logger_config=None), RLlib will construct a Ray Tune UnifiedLogger object, +which logs results to JSON, CSV, and TBX. + +NOTE that a custom Logger is different from a custom `ProgressReporter`, which defines, +how the (frequent) outputs to your console will be formatted. To see an example on how +to write your own Progress reporter, see: +https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_progress_reporter.py # noqa Below examples include: - Disable logging entirely. - Using only one of tune's Json, CSV, or TBX loggers. - Defining a custom logger (by sub-classing tune.logger.py::Logger). 
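To make the `logger_config` pattern described above concrete, here is a minimal sketch (illustrative only, not part of the patch); it plugs in Tune's built-in `NoopLogger` to switch off result logging, matching the first bullet above, with all keys other than "type" passed on to the logger as described:

```
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import NoopLogger

config = (
    PPOConfig()
    .environment("CartPole-v1")
    # Disable result logging entirely via Tune's no-op logger.
    .debugging(logger_config={"type": NoopLogger, "logdir": "/tmp"})
)
algo = config.build()
print(algo.train()["episode_reward_mean"])
algo.stop()
```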
-""" -import argparse -import os -from ray.rllib.utils.test_utils import check_learning_achieved +How to run this script +---------------------- +`python [script file name].py + + +Results to expect +----------------- +You should see log lines similar to the following in your console output. Note that +these logged lines will mix with the ones produced by Tune's default ProgressReporter. +See above link on how to setup a custom one. + +ABC Avg-return: 20.609375; pi-loss: -0.02921550187703246 +ABC Avg-return: 32.28688524590164; pi-loss: -0.023369029412534572 +ABC Avg-return: 51.92; pi-loss: -0.017113141975661456 +ABC Avg-return: 76.16; pi-loss: -0.01305474770361625 +ABC Avg-return: 100.54; pi-loss: -0.007665307738129169 +ABC Avg-return: 132.33; pi-loss: -0.005010405003325517 +ABC Avg-return: 169.65; pi-loss: -0.008397869592997183 +ABC Avg-return: 203.17; pi-loss: -0.005611495616764371 +Flushing +Closing + +""" + +from ray import air, tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.logger import Logger, LegacyLoggerCallback -from ray.tune.registry import get_trainable_cls - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=200, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=150.0, help="Reward at which we stop training." -) class MyPrintLogger(Logger): @@ -58,7 +67,15 @@ def _init(self): def on_result(self, result: dict): # Define, what should happen on receiving a `result` (dict). - print(f"{self.prefix}: {result}") + mean_return = result["sampler_results"]["episode_reward_mean"] + pi_loss = ( + result["info"]["learner"]["default_policy"]["policy_loss"] + ) + print( + f"{self.prefix} " + f"Avg-return: {mean_return} " + f"pi-loss: {pi_loss}" + ) def close(self): # Releases all resources used by this logger. @@ -66,25 +83,15 @@ def close(self): def flush(self): # Flushing all possible disk writes to permanent storage. - print("Flushing ;)", flush=True) + print("Flushing", flush=True) if __name__ == "__main__": - import ray - from ray import air, tune - - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None) - config = ( - get_trainable_cls(args.run) - .get_default_config() - .environment( - "CartPole-v1" if args.run not in ["DDPG", "TD3"] else "Pendulum-v1" - ) - # Run with tracing enabled for tf2. - .framework(args.framework) + PPOConfig() + .experimental(_enable_new_api_stack=True) + .rollouts(env_runner_cls=SingleAgentEnvRunner) + .environment("CartPole-v1") # Setting up a custom logger config. # ---------------------------------- # The following are different examples of custom logging setups: @@ -115,27 +122,20 @@ def flush(self): # "logdir": "/somewhere/on/my/file/system/" } ) - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
- .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ) - stop = { - "training_iteration": args.stop_iters, - "timesteps_total": args.stop_timesteps, - "episode_reward_mean": args.stop_reward, - } + stop = {"sampler_results/episode_reward_mean": 200.0} - tuner = tune.Tuner( - args.run, - param_space=config.to_dict(), + # Run the actual experiment (using Tune). + results = tune.Tuner( + config.algo_class, + param_space=config, run_config=air.RunConfig( stop=stop, verbose=2, - callbacks=[LegacyLoggerCallback([MyPrintLogger])], + # Plugin our own logger. + callbacks=[ + LegacyLoggerCallback([MyPrintLogger]), + ], ), - ) - results = tuner.fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() + ).fit() diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py new file mode 100644 index 000000000000..f06f79312149 --- /dev/null +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -0,0 +1,115 @@ +"""Example showing how to set up a custom progress reporter for an RLlib Algorithm. + +The script sets the `progress_reporter` arg in the air.RunConfig and passes that to +Tune's Tuner: + +``` +tune.Tuner( + param_space=..., # <- your RLlib config + run_config=air.RunConfig( + progress_reporter=[some already instantiated TuneReporterBase object], + ), +) +``` + +By default (progress_reporter=None), Tune will construct a default `CLIReporter` object, +which reports the episode mean return, number of env steps sampled and -trained, and +the total number of episodes run thus far. + +NOTE that a custom progress reporter is different from a custom `Logger`, which defines, +how the (frequent) results are being formatted and written to e.g. a logfile. +To see an example on how to write your own Logger, see: +https://github.com/ray-project/ray/tree/master/rllib/examples/ray_tune/custom_logger.py + + +How to run this script +---------------------- +`python [script file name].py + + +Results to expect +----------------- +You should see something similar to the following in your console output: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_bb503_00000 | TERMINATED | 127.0.0.1:26303 | 5 | 30.3823 | ++---------------------+------------+-----------------+--------+------------------+ ++-------+-------------------+------------------+------------------+------------------+ +| ts | combined return | return policy1 | return policy2 | return policy3 | +|-------+-------------------+------------------+------------------+------------------| +| 20000 | 258.7 | 103.4 | 88.84 | 87.86 | ++-------+-------------------+------------------+------------------+------------------+ + +""" +from ray import air, tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole + + +my_multi_agent_progress_reporter = tune.CLIReporter( + # In the following dict, the keys are the (possibly nested) keys that can be found + # in RLlib's (PPO's) result dict, produced at every training iteration, and the + # values are the column names you would like to see in your console reports. + # Note that for nested result dict keys, you need to use slashes "/" to define the + # exact path. 
+ metric_columns={ + **{ + "training_iteration": "iter", + "time_total_s": "total time (s)", + "timesteps_total": "ts", + # RLlib always sums up all agents' rewards and reports it under: + # result_dict[sampler_results][episode_reward_mean]. + "sampler_results/episode_reward_mean": "combined return", + }, + # Because RLlib sums up all returns of all agents, we would like to also + # see the individual agents' returns. We can find these under the result dict's + # 'policy_reward_mean' key (then the policy ID): + **{ + f"policy_reward_mean/{pid}": f"return {pid}" + for pid in ["policy1", "policy2", "policy3"] + }, + }, +) + + +if __name__ == "__main__": + # Force Tuner to use old progress output as the new one silently ignores our custom + # `CLIReporter`. + # TODO (sven): Find out why we require this hack. + import os + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + + # Register our multi-agent env with a fixed number of agents. + # The agents' IDs are 0, 1, and 2. + tune.register_env("env", lambda _: MultiAgentCartPole({"num_agents": 3})) + + config = ( + PPOConfig() + .experimental(_enable_new_api_stack=True) + .rollouts(env_runner_cls=MultiAgentEnvRunner) + .environment("env") + .multi_agent( + # Define 3 policies. Note that in our simple setup, they are all configured + # the exact same way (with a PPO default RLModule/NN). + policies={"policy1", "policy2", "policy3"}, + # Map agent 0 to "policy1", etc.. + policy_mapping_fn=lambda agent_id, episode: f"policy{agent_id + 1}", + ) + ) + + stop = {"sampler_results/episode_reward_mean": 200.0} + + # Run the actual experiment (using Tune). + results = tune.Tuner( + config.algo_class, + param_space=config, + run_config=air.RunConfig( + stop=stop, + verbose=2, + # Plugin our own progress reporter. + progress_reporter=my_multi_agent_progress_reporter, + ), + ).fit() diff --git a/rllib/examples/ray_tune/custom_train_function.py b/rllib/examples/ray_tune/custom_train_function.py deleted file mode 100644 index cf8bd4c40fbd..000000000000 --- a/rllib/examples/ray_tune/custom_train_function.py +++ /dev/null @@ -1,69 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. - -"""Example of a custom training workflow. Run this for a demo. - -This example shows: - - using Tune trainable functions to implement custom training workflows - -You can visualize experiment results in ~/ray_results using TensorBoard. -""" -import argparse -import os - -import ray -from ray import train, tune -from ray.rllib.algorithms.ppo import PPO, PPOConfig - -parser = argparse.ArgumentParser() -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) - - -def my_train_fn(config): - iterations = config.pop("train-iterations", 10) - - config = PPOConfig().update_from_dict(config).environment("CartPole-v1") - - # Train for n iterations with high LR. 
- config.lr = 0.01 - agent1 = config.build() - for _ in range(iterations): - result = agent1.train() - result["phase"] = 1 - train.report(result) - phase1_time = result["timesteps_total"] - state = agent1.save() - agent1.stop() - - # Train for n iterations with low LR - config.lr = 0.0001 - agent2 = config.build() - agent2.restore(state) - for _ in range(iterations): - result = agent2.train() - result["phase"] = 2 - result["timesteps_total"] += phase1_time # keep time moving forward - train.report(result) - agent2.stop() - - -if __name__ == "__main__": - ray.init() - args = parser.parse_args() - config = { - # Special flag signalling `my_train_fn` how many iters to do. - "train-iterations": 2, - # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. - "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), - "num_workers": 0, - "framework": args.framework, - } - resources = PPO.default_resource_request(config) - tuner = tune.Tuner( - tune.with_resources(my_train_fn, resources=resources), param_space=config - ) - tuner.fit() diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index ecd9da2f896e..14700569b09b 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1205,6 +1205,8 @@ def run_rllib_example_script_experiment( *, stop: Optional[Dict] = None, success_metric: Optional[Dict] = None, + trainable: Optional[Type] = None, + tune_callbacks: Optional[List] = None, ) -> Union[ResultDict, tune.result_grid.ResultGrid]: """Given an algorithm config and some command line args, runs an experiment. @@ -1251,6 +1253,11 @@ def run_rllib_example_script_experiment( value to be reached in order for the experiment to count as successful. If `args.as_test` is True AND this `success_metric` is not reached with the bounds defined by `stop`, will raise an Exception. + trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), + use the registered RLlib Algorithm class specified by args.algo. + tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. + In case `args.wandb_key` is provided, will append a WandB logger to this + list. Returns: The last ResultDict from a --no-tune run OR the tune.Tuner.fit() @@ -1327,19 +1334,18 @@ def run_rllib_example_script_experiment( # Run the experiment using Ray Tune. # Log results using WandB. - callbacks = None if hasattr(args, "wandb_key") and args.wandb_key is not None: project = args.wandb_project or ( args.algo.lower() + "-" + re.sub("\\W+", "-", str(config.env).lower()) ) - callbacks = [ + tune_callbacks.append( WandbLoggerCallback( api_key=args.wandb_key, project=project, upload_checkpoints=True, **({"name": args.wandb_run_name} if args.wandb_run_name else {}), ) - ] + ) # Auto-configure a CLIReporter (to log the results to the console). # Use better ProgressReporter for multi-agent cases: List individual policy rewards. @@ -1351,10 +1357,10 @@ def run_rllib_example_script_experiment( "training_iteration": "iter", "time_total_s": "total time (s)", "timesteps_total": "ts", - "sampler_results/episode_reward_mean": "combined reward", + "sampler_results/episode_reward_mean": "combined return", }, **{ - f"policy_reward_mean/{pid}": f"reward {pid}" + f"policy_reward_mean/{pid}": f"return {pid}" for pid in config.policies }, }, @@ -1366,12 +1372,12 @@ def run_rllib_example_script_experiment( # Run the actual experiment (using Tune). 
results = tune.Tuner( - config.algo_class, + trainable or config.algo_class, param_space=config, run_config=air.RunConfig( stop=stop, verbose=args.verbose, - callbacks=callbacks, + callbacks=tune_callbacks, checkpoint_config=air.CheckpointConfig( checkpoint_frequency=args.checkpoint_freq, checkpoint_at_end=args.checkpoint_at_end, From 2221ac6839b3712bae16a1df8eec81fe25fc482e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 14:19:38 +0200 Subject: [PATCH 07/11] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- .../examples/evaluation/custom_evaluation.py | 51 ++++++++- .../evaluation_parallel_to_training.py | 103 +++++++++++++----- rllib/examples/ray_tune/custom_experiment.py | 22 ++-- rllib/examples/ray_tune/custom_logger.py | 2 +- rllib/utils/test_utils.py | 11 +- 6 files changed, 143 insertions(+), 48 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 9e6f55882738..361fa09c19cf 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -334,7 +334,7 @@ def validate(self) -> None: elif self._enable_new_api_stack: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size - if mbs > tbs: + if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: raise ValueError( f"`mini_batch_size_per_learner` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index 7902ade2b520..01e2e19b1677 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -16,6 +16,51 @@ - It runs a defined number of episodes for evaluation purposes. - It collects the metrics from those runs, summarizes these metrics and returns them. + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack + +You can switch off custom evaluation (and use RLlib's default evaluation procedure) +with the `--no-custom-eval` flag. + +You can switch on parallel evaluation to training using the +`--evaluation-parallel-to-training` flag. See this example script here: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py # noqa +for more details on running evaluation parallel to training. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following (or very similar) console output when running this script. +Note that for each iteration, due to the definition of our custom evaluation function, +we run 3 evaluation rounds per single training round. + +... +Training iteration 1 -> evaluation round 0 +Training iteration 1 -> evaluation round 1 +Training iteration 1 -> evaluation round 2 +... +... 
++--------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|--------------------------------+------------+-----------------+--------+ +| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 | 4 | ++--------------------------------+------------+-----------------+--------+ ++------------------+-------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+-------+----------+--------------------| +| 26.1973 | 16000 | 0.872034 | 13.7966 | ++------------------+-------+----------+--------------------+ """ from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -69,10 +114,10 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul # processing. rollout_metrics = [] - # For demonstration purposes, run through some arbitrary number of evaluation - # round within this one call. Note that this function is called once per + # For demonstration purposes, run through some number of evaluation + # rounds within this one call. Note that this function is called once per # training iteration (`Algorithm.train()` call) OR once per `Algorithm.evaluate()` - # (which may be called manually by the user). + # (which can be called manually by the user). for i in range(3): print(f"Training iteration {algorithm.iteration} -> evaluation round {i}") # Sample episodes from the EnvRunners AND have them return only the thus diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index 124ad92eb65f..84313a9e5ce5 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -1,6 +1,72 @@ +"""Example showing how one can set up evaluation running in parallel to training. + +Such a setup saves a considerable amount of time during RL Algorithm training, b/c +the next training step does NOT have to wait for the previous evaluation procedure to +finish, but can already start running (in parallel). + +See RLlib's documentation for more details on the effect of the different supported +evaluation configuration options: +https://docs.ray.io/en/latest/rllib/rllib-advanced-api.html#customized-evaluation-during-training # noqa + +For an example of how to write a fully customized evaluation function (which normally +is not necessary as the config options are sufficient and offer maximum flexibility), +see this example script here: + +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py # noqa + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack` + +Use the `--evaluation-num-workers` option to scale up the evaluation workers. Note +that the requested evaluation duration (`--evaluation-duration` measured in +`--evaluation-duration-unit`, which is either "timesteps" (default) or "episodes") is +shared between all configured evaluation workers. For example, if the evaluation +duration is 10 and the unit is "episodes" and you configured 5 workers, then each of the +evaluation workers will run exactly 2 episodes. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. 
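The corresponding evaluation settings on the AlgorithmConfig look roughly as follows (a minimal sketch, illustrative only and not part of the patch; the script derives the exact values from the command line options described above):

```
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .evaluation(
        # Evaluate after every training iteration ...
        evaluation_interval=1,
        # ... in parallel to the next training step ...
        evaluation_parallel_to_training=True,
        # ... on two dedicated evaluation workers ...
        evaluation_num_workers=2,
        # ... for as long as the parallel training step takes
        # ("auto" requires parallel evaluation) ...
        evaluation_duration="auto",
        # ... acting greedily during evaluation.
        evaluation_config={"explore": False},
    )
)
```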
+ +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- +You should see the following output (at the end of the experiment) in your console when +running with a fixed number of 100k training timesteps +(`--enable-new-api-stack --evaluation-duration=auto --stop-timesteps=100000 +--stop-reward=100000`): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_1377a_00000 | TERMINATED | 127.0.0.1:73330 | 25 | ++-----------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 71.7485 | 100000 | 476.51 | 476.51 | ++------------------+--------+----------+--------------------+ + +When running without parallel evaluation (`--evaluation-not-parallel-to-training` flag), +the experiment takes considerably longer (~70sec vs ~80sec): ++-----------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +|-----------------------------+------------+-----------------+--------+ +| PPO_CartPole-v1_f1788_00000 | TERMINATED | 127.0.0.1:75135 | 25 | ++-----------------------------+------------+-----------------+--------+ ++------------------+--------+----------+--------------------+ +| total time (s) | ts | reward | episode_len_mean | +|------------------+--------+----------+--------------------| +| 81.7371 | 100000 | 494.68 | 494.68 | ++------------------+--------+----------+--------------------+ +""" from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, @@ -114,13 +180,10 @@ def on_train_result(self, *, algorithm, result, **kwargs): lambda _: MultiAgentCartPole(config={"num_agents": args.num_agents}), ) - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() - .experimental(_enable_new_api_stack=args.enable_new_api_stack) .environment("env" if args.num_agents > 0 else "CartPole-v1") - # Run with tracing enabled for tf2. - .framework(args.framework) # Use a custom callback that asserts that we are running the # configured exact number of episodes per evaluation OR - in auto # mode - run at least as many episodes as we have eval workers. @@ -148,28 +211,11 @@ def on_train_result(self, *, algorithm, result, **kwargs): # Switch off exploratory behavior for better (greedy) results. evaluation_config={"explore": False}, ) - .rollouts( - num_rollout_workers=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ), - ) - .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=int(args.num_gpus != 0), - num_cpus_for_local_worker=1, - ) ) # Add a simple multi-agent setup. 
if args.num_agents > 0: - config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) @@ -180,4 +226,11 @@ def on_train_result(self, *, algorithm, result, **kwargs): "timesteps_total": args.stop_timesteps, } - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment( + base_config, + args, + stop=stop, + success_metric={ + "evaluation/sampler_results/episode_reward_mean": args.stop_reward, + }, + ) diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 2f371e87ce54..58c1d9b0b88d 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -2,21 +2,19 @@ You should only use such a customized workflow if the following conditions apply: - You know exactly what you are doing :) -- Simply configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig -is not enough and doesn't allow you to shape the Algorithm into behaving the way you'd -like. --- Note that for complex and custom evaluation procedures there is a RLlib Algorithm -config option (see examples/evaluation/custom_evaluation.py for more details). -- Subclassing +- Configuring an existing RLlib Algorithm (e.g. PPO) via its AlgorithmConfig +is not sufficient and doesn't allow you to shape the Algorithm into behaving the way +you'd like. Note that for complex, custom evaluation procedures there are many +AlgorithmConfig options one can use (for more details, see: +https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/custom_evaluation.py). # noqa +- Subclassing an RLlib Algorithm class and overriding the new class' `training_step` +method is not sufficient and doesn't allow you to define the algorithm's execution +logic the way you'd like. See an example here on how to customize the algorithm's +`training_step()` method: +https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa -""" - -"""Example of a custom training workflow. Run this for a demo. -This example shows: - - using Tune trainable functions to implement custom training workflows -You can visualize experiment results in ~/ray_results using TensorBoard. """ from ray import train, tune from ray.rllib.algorithms.ppo import PPO, PPOConfig diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index 89b59222a31f..b9c50d41a204 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -28,7 +28,7 @@ How to run this script ---------------------- -`python [script file name].py +`python [script file name].py` Results to expect diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 14700569b09b..69599901f9d5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1245,14 +1245,13 @@ def run_rllib_example_script_experiment( "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, } - success_metric: Only relevent if `args.as_test` is True. + success_metric: Only relevant if `args.as_test` is True. A dict mapping a single(!) ResultDict key string (using "/" in case of nesting, e.g. "sampler_results/episode_reward_mean" for referring - to `result_dict['sampler_results']['episode_reward_mean']` to minimum - values, reaching of which will stop the experiment) to a single(!) 
minimum - value to be reached in order for the experiment to count as successful. - If `args.as_test` is True AND this `success_metric` is not reached with the - bounds defined by `stop`, will raise an Exception. + to `result_dict['sampler_results']['episode_reward_mean']` to a single(!) + minimum value to be reached in order for the experiment to count as + successful. If `args.as_test` is True AND this `success_metric` is not + reached with the bounds defined by `stop`, will raise an Exception. trainable: The Trainable sub-class to run in the tune.Tuner. If None (default), use the registered RLlib Algorithm class specified by args.algo. tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. From 93491ced5543b2954b714dae48d66aee60a89356 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 15:42:59 +0200 Subject: [PATCH 08/11] wip Signed-off-by: sven1977 --- rllib/BUILD | 1 - rllib/algorithms/algorithm.py | 4 + rllib/algorithms/algorithm_config.py | 8 +- rllib/examples/ray_tune/custom_experiment.py | 161 +++++++++++++----- rllib/examples/ray_tune/custom_logger.py | 10 +- .../ray_tune/custom_progress_reporter.py | 1 + 6 files changed, 129 insertions(+), 56 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 82d844445c3d..69523777fd9e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2840,7 +2840,6 @@ py_test( tags = ["team:rllib", "exclusive", "examples"], size = "medium", srcs = ["examples/ray_tune/custom_experiment.py"], - args = ["--train-iterations=10"] ) py_test( diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index c784e809fdd8..736d93ac18e8 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2328,6 +2328,10 @@ def load_checkpoint(self, checkpoint_dir: str) -> None: if self.config._enable_new_api_stack: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.load_state(learner_state_dir) + # Make also sure, all training EnvRunners get the just loaded weights. + weights = self.learner_group.get_weights() + self.workers.local_worker().set_weights(weights) + self.workers.sync_weights() # Call the `on_checkpoint_loaded` callback. self.callbacks.on_checkpoint_loaded(algorithm=self) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 17d29c9effbd..66127751c520 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -4022,10 +4022,10 @@ def _validate_to_be_deprecated_settings(self): deprecation_warning( old="AlgorithmConfig.env_task_fn", help="The `env_task_fn` API is not supported on the new API stack! " - "Curriculum learning should instead be implemented solely via " - "custom callbacks. Check out our curriculum learning example " - "script for more information: " - "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py" # noqa + "Curriculum learning should instead be implemented solely via " + "custom callbacks. 
Check out our curriculum learning example " + "script for more information: " + "https://github.com/ray-project/ray/blob/master/rllib/examples/curriculum/curriculum_learning.py", # noqa ) if self.preprocessor_pref not in ["rllib", "deepmind", None]: diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 58c1d9b0b88d..330c96ab5c21 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -16,58 +16,127 @@ """ +from typing import Dict + +import numpy as np from ray import train, tune -from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() -def my_experiment(config): - iterations = config.pop("train-iterations", 10) +def my_experiment(config: Dict): - config = PPOConfig().update_from_dict(config).environment("CartPole-v1") + # Extract the number of iterations to run from the config. + train_iterations = config.pop("train-iterations", 2) + eval_episodes_to_do = config.pop("eval-episodes", 1) + + config = ( + PPOConfig() + .update_from_dict(config) + .experimental(_enable_new_api_stack=True) + .environment("CartPole-v1") + ) # Train for n iterations with high LR. - config.lr = 0.01 - agent1 = config.build() - for _ in range(iterations): - result = agent1.train() - result["phase"] = 1 - train.report(result) - phase1_time = result["timesteps_total"] - state = agent1.save() - agent1.stop() - - # Train for n iterations with low LR - config.lr = 0.0001 - agent2 = config.build() - agent2.restore(state) - for _ in range(iterations): - result = agent2.train() - result["phase"] = 2 - result["timesteps_total"] += phase1_time # keep time moving forward - train.report(result) - agent2.stop() - - # Manual Eval - config["num_workers"] = 0 - eval_algo = ppo.PPO(config=config) - eval_algo.restore(checkpoint) - env = eval_algo.workers.local_worker().env - - obs, info = env.reset() - done = False - eval_results = {"eval_reward": 0, "eval_eps_length": 0} - while not done: - action = eval_algo.compute_single_action(obs) - next_obs, reward, done, truncated, info = env.step(action) - eval_results["eval_reward"] += reward - eval_results["eval_eps_length"] += 1 + config.training(lr=0.001) + algo_high_lr = config.build() + for _ in range(train_iterations): + train_results = algo_high_lr.train() + # Add the phase to the result dict. + train_results["phase"] = 1 + train.report(train_results) + phase_high_lr_time = train_results["timesteps_total"] + checkpoint_training_high_lr = algo_high_lr.save() + algo_high_lr.stop() + + # Train for n iterations with low LR. + config.training(lr=0.00001) + algo_low_lr = config.build() + # Load state from the high-lr algo into this one. + algo_low_lr.restore(checkpoint_training_high_lr) + for _ in range(train_iterations): + train_results = algo_low_lr.train() + # Add the phase to the result dict. + train_results["phase"] = 2 + # keep time moving forward + train_results["timesteps_total"] += phase_high_lr_time + train.report(train_results) + + print( + f"after low lr training: episode return={train_results['episode_reward_mean']}" + ) + print(f"model weights={algo_low_lr.workers.local_worker().module.get_state()}") + checkpoint_training_low_lr = algo_low_lr.save() + algo_low_lr.stop() + + # After training, run a manual evaluation procedure. 
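    # (Illustrative editorial note, not part of the patch: an alternative to the
    # hand-rolled env loop below is RLlib's built-in evaluation, e.g. configuring
    # `config.evaluation(evaluation_duration=3, evaluation_duration_unit="episodes",
    # evaluation_config={"explore": False})` and calling `Algorithm.evaluate()`
    # after restoring the checkpoint; the explicit loop below instead shows the
    # low-level RLModule inference path.)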
+ + # Set the number of EnvRunners for collecting training data to 0 (local + # worker only). + config.rollouts(num_rollout_workers=0) + + eval_algo = config.build() + # Load state from the low-lr algo into this one. + eval_algo.restore(checkpoint_training_low_lr) + # The algo's local worker (SingleAgentEnvRunner) that holds a + # gym.vector.Env object and an RLModule for computing actions. + local_env_runner = eval_algo.workers.local_worker() + # Extract the gymnasium env object from the created algo (its local + # SingleAgentEnvRunner worker). Note that the env in this single-agent + # case is a gymnasium vector env and that we get its first sub-env here. + env = local_env_runner.env.envs[0] + + # The local worker (SingleAgentEnvRunner) + rl_module = local_env_runner.module + + print(f"LOADED eval model weights={rl_module.get_state()}") + + # Run a very simple env loop and add up rewards over a single episode. + obs, infos = env.reset() + episode_returns = [] + episode_lengths = [] + sum_rewards = length = 0 + num_episodes = 0 + while num_episodes < eval_episodes_to_do: + # Call the RLModule's `forward_inference()` method to compute an + # action. + rl_module_out = rl_module.forward_inference( + { + "obs": torch.from_numpy(np.expand_dims(obs, 0)), # <- add B=1 + } + ) + action_logits = rl_module_out["action_dist_inputs"][0] # <- remove B=1 + action = np.argmax(action_logits.detach().cpu().numpy()) # act greedily + + # Step the env. + obs, reward, terminated, truncated, info = env.step(action) + + # Acculumate stats and reset the env, if necessary. + sum_rewards += reward + length += 1 + if terminated or truncated: + num_episodes += 1 + episode_returns.append(sum_rewards) + episode_lengths.append(length) + sum_rewards = length = 0 + obs, infos = env.reset() + + # Compile evaluation results. + eval_results = { + "eval_returns": episode_returns, + "eval_episode_lengths": episode_lengths, + } + # Combine the most recent training results with the just collected + # evaluation results. results = {**train_results, **eval_results} + # Report everything. train.report(results) if __name__ == "__main__": - base_config = ( PPOConfig() .experimental(_enable_new_api_stack=True) @@ -84,8 +153,11 @@ def my_experiment(config): # function. config_dict = base_config.to_dict() - # Set a Special flag signalling `my_train_fn` how many iters to do. - config_dict["train-iterations"] = 2 + # Set a Special flag signalling `my_experiment` how many training steps to + # perform on each: the high learning rate and low learning rate. + config_dict["train-iterations"] = 5 + # Set a Special flag signalling `my_experiment` how many episodes to evaluate for. + config_dict["eval-episodes"] = 3 training_function = tune.with_resources( my_experiment, @@ -97,4 +169,7 @@ def my_experiment(config): # Pass in your config dict. param_space=config_dict, ) - tuner.fit() + results = tuner.fit() + best_results = results.get_best_result() + + print(f"evaluation episode returns={best_results.metrics['eval_returns']}") diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index b9c50d41a204..f2988bcbd6c4 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -68,14 +68,8 @@ def _init(self): def on_result(self, result: dict): # Define, what should happen on receiving a `result` (dict). 
mean_return = result["sampler_results"]["episode_reward_mean"] - pi_loss = ( - result["info"]["learner"]["default_policy"]["policy_loss"] - ) - print( - f"{self.prefix} " - f"Avg-return: {mean_return} " - f"pi-loss: {pi_loss}" - ) + pi_loss = result["info"]["learner"]["default_policy"]["policy_loss"] + print(f"{self.prefix} " f"Avg-return: {mean_return} " f"pi-loss: {pi_loss}") def close(self): # Releases all resources used by this logger. diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index f06f79312149..01244f8deb71 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -80,6 +80,7 @@ # `CLIReporter`. # TODO (sven): Find out why we require this hack. import os + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" # Register our multi-agent env with a fixed number of agents. From cf44987ddf56a94f9b6e4bae25a7ff9a13221211 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 16:36:00 +0200 Subject: [PATCH 09/11] wip Signed-off-by: sven1977 --- rllib/algorithms/ppo/ppo.py | 2 +- rllib/examples/ray_tune/custom_experiment.py | 30 +++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 73001b31b330..8d5157c7bc4b 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -334,7 +334,7 @@ def validate(self) -> None: elif self._enable_new_api_stack: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size - if isinstance(mbs, int) and isinstance(tbs, int) and mbs > tbs: + if mbs > tbs: raise ValueError( f"`mini_batch_size_per_learner` ({mbs}) must be <= " f"`train_batch_size_per_learner` ({tbs}). In PPO, the train batch" diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 330c96ab5c21..35464f66667b 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -14,7 +14,29 @@ https://github.com/ray-project/ray/blob/master/rllib/examples/algorithm/custom_training_step_on_and_off_policy_combined.py # noqa - +How to run this script +---------------------- +`python [script file name].py` + + +Results to expect +----------------- +You should see the following output (at the end of the experiment) in your console: + +╭─────────────────────────────────────────────────────────────────────────────────────── +│ Trial name status iter total time (s) ts +├─────────────────────────────────────────────────────────────────────────────────────── +│ my_experiment_CartPole-v1_77083_00000 TERMINATED 10 36.7799 60000 +╰─────────────────────────────────────────────────────────────────────────────────────── +╭───────────────────────────────────────────────────────╮ +│ reward episode_len_mean episodes_this_iter │ +├───────────────────────────────────────────────────────┤ +│ 254.821 254.821 12 │ +╰───────────────────────────────────────────────────────╯ +evaluation episode returns=[500.0, 500.0, 500.0] + +Note that evaluation results (on the CartPole-v1 env) should be close to perfect +(episode return of ~500.0) as we are acting greedily inside the evaluation procedure. 
""" from typing import Dict @@ -65,10 +87,6 @@ def my_experiment(config: Dict): train_results["timesteps_total"] += phase_high_lr_time train.report(train_results) - print( - f"after low lr training: episode return={train_results['episode_reward_mean']}" - ) - print(f"model weights={algo_low_lr.workers.local_worker().module.get_state()}") checkpoint_training_low_lr = algo_low_lr.save() algo_low_lr.stop() @@ -92,8 +110,6 @@ def my_experiment(config: Dict): # The local worker (SingleAgentEnvRunner) rl_module = local_env_runner.module - print(f"LOADED eval model weights={rl_module.get_state()}") - # Run a very simple env loop and add up rewards over a single episode. obs, infos = env.reset() episode_returns = [] From 9f872667108eaaf93f072b8f1affa7182dbe1449 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 19:24:53 +0200 Subject: [PATCH 10/11] fixes Signed-off-by: sven1977 --- rllib/BUILD | 6 +++--- rllib/tests/test_ray_client.py | 33 ++++----------------------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 69523777fd9e..6e54ec137224 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2851,11 +2851,11 @@ py_test( ) py_test( - name = "examples/ray_tune/custom_progres_reporter", - main = "examples/ray_tune/custom_progres_reporter.py", + name = "examples/ray_tune/custom_progress_reporter", + main = "examples/ray_tune/custom_progress_reporter.py", tags = ["team:rllib", "exclusive", "examples"], size = "small", - srcs = ["examples/ray_tune/custom_progres_reporter.py"], + srcs = ["examples/ray_tune/custom_progress_reporter.py"], ) # subdirectory: rl_modules/ diff --git a/rllib/tests/test_ray_client.py b/rllib/tests/test_ray_client.py index b60e8b1f1b3e..06d573537607 100644 --- a/rllib/tests/test_ray_client.py +++ b/rllib/tests/test_ray_client.py @@ -14,12 +14,12 @@ def test_connection(self): assert ray.util.client.ray.is_connected() assert ray.util.client.ray.is_connected() is False - def test_custom_train_function(self): + def test_custom_experiment(self): with ray_start_client_server(): assert ray.util.client.ray.is_connected() config = { - # Special flag signalling `my_train_fn` how many iters to do. + # Special flag signalling `my_experiment` how many iters to do. "train-iterations": 2, "lr": 0.01, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. @@ -28,10 +28,10 @@ def test_custom_train_function(self): "framework": "tf", } resources = ppo.PPO.default_resource_request(config) - from ray.rllib.examples.ray_tune.custom_train_function import my_train_fn + from ray.rllib.examples.ray_tune.custom_experiment import my_experiment tune.Tuner( - tune.with_resources(my_train_fn, resources), + tune.with_resources(my_experiment, resources), param_space=config, ).fit() @@ -53,31 +53,6 @@ def test_cartpole_lstm(self): run_config=air.RunConfig(stop=stop, verbose=2), ).fit() - def test_custom_experiment(self): - - with ray_start_client_server(ray_init_kwargs={"num_cpus": 3}): - assert ray.util.client.ray.is_connected() - - config = ppo.PPOConfig().environment("CartPole-v1") - # Special flag signalling `experiment` how many iters to do. - config = config.to_dict() - config["train-iterations"] = 2 - - from ray.rllib.examples.ray_tune.custom_experiment import experiment - - # Ray client does not seem to propagate the `fn._resources` property - # correctly for imported functions. As a workaround, we can wrap the - # imported function which forces a full transfer. 
- def wrapped_experiment(config): - experiment(config) - - tune.Tuner( - tune.with_resources( - wrapped_experiment, ppo.PPO.default_resource_request(config) - ), - param_space=config, - ).fit() - if __name__ == "__main__": import pytest From 8a1002a00d0a144e662576a12e3a17b1c9e21779 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 15 Apr 2024 22:25:46 +0200 Subject: [PATCH 11/11] fix Signed-off-by: sven1977 --- rllib/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/BUILD b/rllib/BUILD index 6e54ec137224..729f8b1c353e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2854,7 +2854,7 @@ py_test( name = "examples/ray_tune/custom_progress_reporter", main = "examples/ray_tune/custom_progress_reporter.py", tags = ["team:rllib", "exclusive", "examples"], - size = "small", + size = "medium", srcs = ["examples/ray_tune/custom_progress_reporter.py"], )
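For reference, the skeleton shared by the example scripts touched in this series, built on the two `rllib/utils/test_utils.py` helpers patched above, looks roughly like this (an illustrative sketch, not part of the patch):

```
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.tune.registry import get_trainable_cls

parser = add_rllib_example_script_args()

if __name__ == "__main__":
    args = parser.parse_args()

    base_config = (
        get_trainable_cls(args.algo)
        .get_default_config()
        .environment("CartPole-v1")
    )

    run_rllib_example_script_experiment(base_config, args)
```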