From cb937c6d099e964aa541f5f08e8f3838534f3ea1 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 04:33:17 +0200 Subject: [PATCH 01/23] wip Signed-off-by: sven1977 --- rllib/examples/gpus/fractional_gpus.py | 136 ------------------ .../gpus/fractional_gpus_per_learner.py | 96 +++++++++++++ 2 files changed, 96 insertions(+), 136 deletions(-) delete mode 100644 rllib/examples/gpus/fractional_gpus.py create mode 100644 rllib/examples/gpus/fractional_gpus_per_learner.py diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py deleted file mode 100644 index 3895b4772e03..000000000000 --- a/rllib/examples/gpus/fractional_gpus.py +++ /dev/null @@ -1,136 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. - -"""Example of a custom gym environment and model. Run this for a demo. - -This example shows: - - using a custom environment - - using a custom model - - using Tune for grid search - -You can visualize experiment results in ~/ray_results using TensorBoard. -""" -import argparse - -import ray -from ray import air, tune -from ray.air.constants import TRAINING_ITERATION -from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument("--num-gpus", type=float, default=0.5) -parser.add_argument("--num-workers", type=int, default=1) -parser.add_argument("--num-gpus-per-worker", type=float, default=0.0) -parser.add_argument("--num-envs-per-worker", type=int, default=1) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=180.0, help="Reward at which we stop training." -) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=4) - - # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), - # where ray was started using only one of these GPUs: - # $ ray start --num-gpus=1 --head - - # Note: A strange error could occur when using tf: - # "NotImplementedError: Cannot convert a symbolic Tensor - # (default_policy/cond/strided_slice:0) to a numpy array." - # In rllib/utils/exploration/random.py. - # Fix: Install numpy version 1.19.5. - - # Tested arg combinations (4 tune trials will be setup; see - # tune.grid_search over 4 learning rates below): - # - num_gpus=0.5 (2 tune trials should run in parallel). - # - num_gpus=0.3 (3 tune trials should run in parallel). 
- # - num_gpus=0.25 (4 tune trials should run in parallel) - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 - # -> 3 tune trials should run in parallel. - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 - # -> 2 tune trials should run in parallel. - # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 - # -> 1 tune trial should run in parallel. - - config = ( - get_trainable_cls(args.run) - .get_default_config() - # Setup the test env as one that requires a GPU, iff - # num_gpus_per_worker > 0. - .environment( - GPURequiringEnv if args.num_gpus_per_worker > 0.0 else "CartPole-v1" - ) - .framework(args.framework) - .resources( - # How many GPUs does the local worker (driver) need? For most algos, - # this is where the learning updates happen. - # Set this to > 1 for multi-GPU learning. - num_gpus=args.num_gpus, - # How many GPUs does each RolloutWorker (`num_workers`) need? - num_gpus_per_worker=args.num_gpus_per_worker, - ) - # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_env_runner`)? - .env_runners( - num_env_runners=args.num_workers, - # This setting should not really matter as it does not affect the - # number of GPUs reserved for each worker. - num_envs_per_env_runner=args.num_envs_per_worker, - ) - # 4 tune trials altogether. - .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) - ) - - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - # Note: The above GPU settings should also work in case you are not - # running via ``Tuner.fit()``, but instead do: - - # >> from ray.rllib.algorithms.ppo import PPO - # >> algo = PPO(config=config) - # >> for _ in range(10): - # >> results = algo.train() - # >> print(results) - - results = tune.Tuner( - args.run, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py new file mode 100644 index 000000000000..5aa1eb8e1f81 --- /dev/null +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -0,0 +1,96 @@ +"""Example of using fractional GPUs (< 1.0) per Learner worker. + +This example: + - shows how to setup an Algorithm that uses one or more Learner workers ... + - ... and assigns a fractional (< 1.0) number of GPUs to each of these Learners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-learner-workers= +[number of Learner workers, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
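+
+For example, on a machine with a single GPU, `--num-learners=2
+--num-gpus-per-learner=0.5` (illustrative values, not the script's defaults)
+lets both Learner workers share that one GPU.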
+""" +from ray import tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=180, default_timesteps=100000 +) +parser.add_argument("--num-learners", type=int, default=1) +parser.add_argument("--num-gpus-per-learner", type=float, default=0.5) + + +if __name__ == "__main__": + args = parser.parse_args() + + # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), + # where ray was started using only one of these GPUs: + # $ ray start --num-gpus=1 --head + + # Tested arg combinations (4 tune trials will be setup; see + # tune.grid_search over 4 learning rates below): + # - num_gpus=0.5 (2 tune trials should run in parallel). + # - num_gpus=0.3 (3 tune trials should run in parallel). + # - num_gpus=0.25 (4 tune trials should run in parallel) + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 + # -> 3 tune trials should run in parallel. + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 + # -> 2 tune trials should run in parallel. + # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 + # -> 1 tune trial should run in parallel. + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("CartPole-v1") + .learners(num_learners=args.num_learners) + .resources( + num_learner_workers=args.num_learners, + # How many GPUs does the local worker (driver) need? For most algos, + # this is where the learning updates happen. + # Set this to > 1 for multi-GPU learning. + num_gpus=args.num_gpus, + # How many GPUs does each RolloutWorker (`num_workers`) need? + num_gpus_per_worker=args.num_gpus_per_worker, + ) + # 4 tune trials altogether. 
+ .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Note: The above GPU settings should also work in case you are not + # running via ``Tuner.fit()``, but instead do: + + # >> from ray.rllib.algorithms.ppo import PPO + # >> algo = PPO(config=config) + # >> for _ in range(10): + # >> results = algo.train() + # >> print(results) + + run_rllib_example_script_experiment(base_config, args) From c931bedc1a9438219d54d5662375d9343c4c423e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 04:33:28 +0200 Subject: [PATCH 02/23] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 323 ++++++++++++++++++++------- 1 file changed, 245 insertions(+), 78 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index b7a05eaa395d..8ac662b1171a 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -265,17 +265,10 @@ def __init__(self, algo_class: Optional[type] = None): self.extra_python_environs_for_worker = {} # `self.resources()` - self.num_gpus = 0 - self.num_cpus_per_worker = 1 - self.num_gpus_per_worker = 0 - self._fake_gpus = False - self.num_cpus_for_local_worker = 1 - self.num_learner_workers = 0 - self.num_gpus_per_learner_worker = 0 - self.num_cpus_per_learner_worker = 1 - self.local_gpu_idx = 0 - self.custom_resources_per_worker = {} self.placement_strategy = "PACK" + self.num_gpus = 0 # @OldAPIStack + self._fake_gpus = False # @OldAPIStack + self.num_cpus_for_local_worker = 1 # `self.framework()` self.framework_str = "torch" @@ -338,6 +331,9 @@ def __init__(self, algo_class: Optional[type] = None): self.env_runner_cls = None self.num_env_runners = 0 self.num_envs_per_env_runner = 1 + self.num_cpus_per_env_runner = 1 + self.num_gpus_per_env_runner = 0 + self.custom_resources_per_env_runner = {} self.validate_env_runners_after_construction = True self.sample_timeout_s = 60.0 self.create_env_on_local_worker = False @@ -365,6 +361,12 @@ def __init__(self, algo_class: Optional[type] = None): self.enable_connectors = True self.sampler_perf_stats_ema_coef = None + # `self.learners()` + self.num_learners = 0 + self.num_gpus_per_learner = 0 + self.num_cpus_per_learner = 1 + self.local_gpu_idx = 0 + # `self.training()` self.gamma = 0.99 self.lr = 0.001 @@ -1207,17 +1209,18 @@ def python_environment( def resources( self, *, - num_gpus: Optional[Union[float, int]] = NotProvided, - _fake_gpus: Optional[bool] = NotProvided, - num_cpus_per_worker: Optional[Union[float, int]] = NotProvided, - num_gpus_per_worker: Optional[Union[float, int]] = NotProvided, + num_gpus: Optional[Union[float, int]] = NotProvided, # @OldAPIStack + _fake_gpus: Optional[bool] = NotProvided, # @OldAPIStack num_cpus_for_local_worker: Optional[int] = NotProvided, - num_learner_workers: Optional[int] = NotProvided, - num_cpus_per_learner_worker: Optional[Union[float, int]] = NotProvided, - num_gpus_per_learner_worker: Optional[Union[float, int]] = NotProvided, - local_gpu_idx: Optional[int] = NotProvided, - custom_resources_per_worker: Optional[dict] = NotProvided, placement_strategy: Optional[str] = NotProvided, + # Deprecated args. 
+ num_cpus_per_worker=DEPRECATED_VALUE, # moved to `env_runners` + num_gpus_per_worker=DEPRECATED_VALUE, # moved to `env_runners` + custom_resources_per_worker=DEPRECATED_VALUE, # moved to `env_runners` + num_learner_workers=DEPRECATED_VALUE, # moved to `learners` + num_cpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` + num_gpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` + local_gpu_idx=DEPRECATED_VALUE # moved to `learners` ) -> "AlgorithmConfig": """Specifies resources allocated for an Algorithm and its ray actors/workers. @@ -1230,45 +1233,18 @@ def resources( CPU machine. GPU towers will be simulated by graphs located on CPUs in this case. Use `num_gpus` to test for different numbers of fake GPUs. - num_cpus_per_worker: Number of CPUs to allocate per worker. - num_gpus_per_worker: Number of GPUs to allocate per worker. This can be - fractional. This is usually needed only if your env itself requires a - GPU (i.e., it is a GPU-intensive video game), or model inference is - unusually expensive. - num_learner_workers: Number of workers used for training. A value of 0 - means training will take place on a local worker on head node CPUs or 1 - GPU (determined by `num_gpus_per_learner_worker`). For multi-gpu - training, set number of workers greater than 1 and set - `num_gpus_per_learner_worker` accordingly (e.g. 4 GPUs total, and model - needs 2 GPUs: `num_learner_workers = 2` and - `num_gpus_per_learner_worker = 2`) - num_cpus_per_learner_worker: Number of CPUs allocated per Learner worker. - Only necessary for custom processing pipeline inside each Learner - requiring multiple CPU cores. Ignored if `num_learner_workers = 0`. - num_gpus_per_learner_worker: Number of GPUs allocated per worker. If - `num_learner_workers = 0`, any value greater than 0 will run the - training on a single GPU on the head node, while a value of 0 will run - the training on head node CPU cores. If `num_gpus_per_learner_worker` is - set to > 0, then `num_cpus_per_learner_worker` should not be changed - (from its default value of 1). num_cpus_for_local_worker: Number of CPUs to allocate for the algorithm. Note: this only takes effect when running in Tune. Otherwise, the algorithm runs in the main program (driver). - local_gpu_idx: If `num_gpus_per_learner_worker` > 0, and - `num_learner_workers` < 2, then this GPU index will be used for - training. This is an index into the available - CUDA devices. For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` - then a `local_gpu_idx` of 0 will use the GPU with ID=1 on the node. - custom_resources_per_worker: Any custom Ray resources to allocate per - worker. placement_strategy: The strategy for the placement group factory returned by `Algorithm.default_resource_request()`. A PlacementGroup defines, which devices (resources) should always be co-located on the same node. - For example, an Algorithm with 2 rollout workers, running with - num_gpus=1 will request a placement group with the bundles: - [{"gpu": 1, "cpu": 1}, {"cpu": 1}, {"cpu": 1}], where the first bundle - is for the driver and the other 2 bundles are for the two workers. - These bundles can now be "placed" on the same or different + For example, an Algorithm with 2 EnvRunners and 1 Learner (with + 1 GPU) will request a placement group with the bundles: + [{"cpu": 1}, {"gpu": 1, "cpu": 1}, {"cpu": 1}, {"cpu": 1}], where the + first bundle is for the local (main Algorithm) process, the secon one + for the 1 Learner worker and the last 2 bundles are for the two + EnvRunners. 
These bundles can now be "placed" on the same or different nodes depending on the value of `placement_strategy`: "PACK": Packs bundles into as few nodes as possible. "SPREAD": Places bundles across distinct nodes as even as possible. @@ -1279,30 +1255,71 @@ def resources( Returns: This updated AlgorithmConfig object. """ + if num_cpus_per_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_cpus_per_worker)", + new="AlgorithmConfig.env_runners(num_cpus_per_env_runner)", + error=False, + ) + self.num_cpus_per_env_runner = num_cpus_per_worker + + if num_gpus_per_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_gpus_per_worker)", + new="AlgorithmConfig.env_runners(num_gpus_per_env_runner)", + error=False, + ) + self.num_gpus_per_env_runner = num_gpus_per_worker + + if custom_resources_per_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(custom_resources_per_worker)", + new="AlgorithmConfig.env_runners(custom_resources_per_env_runner)", + error=False, + ) + self.custom_resources_per_env_runner = custom_resources_per_worker + + if num_learner_workers != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_learner_workers)", + new="AlgorithmConfig.learners(num_learner)", + error=False, + ) + self.num_learners = num_learner_workers + + if num_cpus_per_learner_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_cpus_per_learner_worker)", + new="AlgorithmConfig.learners(num_cpus_per_learner)", + error=False, + ) + self.num_cpus_per_learner = num_cpus_per_learner_worker + + if num_gpus_per_learner_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_gpus_per_learner_worker)", + new="AlgorithmConfig.learners(num_gpus_per_learner)", + error=False, + ) + self.num_gpus_per_learner = num_gpus_per_learner_worker + + if local_gpu_idx != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(local_gpu_idx)", + new="AlgorithmConfig.learners(local_gpu_idx)", + error=False, + ) + self.local_gpu_idx = local_gpu_idx + if num_gpus is not NotProvided: self.num_gpus = num_gpus if _fake_gpus is not NotProvided: self._fake_gpus = _fake_gpus - if num_cpus_per_worker is not NotProvided: - self.num_cpus_per_worker = num_cpus_per_worker - if num_gpus_per_worker is not NotProvided: - self.num_gpus_per_worker = num_gpus_per_worker if num_cpus_for_local_worker is not NotProvided: self.num_cpus_for_local_worker = num_cpus_for_local_worker - if custom_resources_per_worker is not NotProvided: - self.custom_resources_per_worker = custom_resources_per_worker if placement_strategy is not NotProvided: self.placement_strategy = placement_strategy - if num_learner_workers is not NotProvided: - self.num_learner_workers = num_learner_workers - if num_cpus_per_learner_worker is not NotProvided: - self.num_cpus_per_learner_worker = num_cpus_per_learner_worker - if num_gpus_per_learner_worker is not NotProvided: - self.num_gpus_per_learner_worker = num_gpus_per_learner_worker - if local_gpu_idx is not NotProvided: - self.local_gpu_idx = local_gpu_idx - return self def framework( @@ -1556,6 +1573,9 @@ def env_runners( env_runner_cls: Optional[type] = NotProvided, num_env_runners: Optional[int] = NotProvided, num_envs_per_env_runner: Optional[int] = NotProvided, + num_cpus_per_env_runner: Optional[int] = NotProvided, + num_gpus_per_env_runner: Optional[Union[float, int]] = NotProvided, + custom_resources_per_env_runner: 
Optional[dict] = NotProvided, validate_env_runners_after_construction: Optional[bool] = NotProvided, sample_timeout_s: Optional[float] = NotProvided, env_to_module_connector: Optional[ @@ -1608,6 +1628,13 @@ def env_runners( (vector-wise) per EnvRunner. This enables batching when computing actions through RLModule inference, which can improve performance for inference-bottlenecked workloads. + num_cpus_per_env_runner: Number of CPUs to allocate per EnvRunner. + num_gpus_per_env_runner: Number of GPUs to allocate per EnvRunner. This can + be fractional. This is usually needed only if your env itself requires a + GPU (i.e., it is a GPU-intensive video game), or model inference is + unusually expensive. + custom_resources_per_env_runner: Any custom Ray resources to allocate per + EnvRunner. sample_timeout_s: The timeout in seconds for calling `sample()` on remote EnvRunner workers. Results (episode list) from workers that take longer than this time are discarded. Only used by algorithms that sample @@ -1770,6 +1797,14 @@ def env_runners( "larger 0!" ) self.num_envs_per_env_runner = num_envs_per_env_runner + + if num_cpus_per_env_runner is not NotProvided: + self.num_cpus_per_env_runner = num_cpus_per_env_runner + if num_gpus_per_env_runner is not NotProvided: + self.num_gpus_per_env_runner = num_gpus_per_env_runner + if custom_resources_per_env_runner is not NotProvided: + self.custom_resources_per_env_runner = custom_resources_per_env_runner + if sample_timeout_s is not NotProvided: self.sample_timeout_s = sample_timeout_s if sample_collector is not NotProvided: @@ -1901,6 +1936,53 @@ def env_runners( return self + def learners( + self, + *, + num_learners: Optional[int] = NotProvided, + num_cpus_per_learner: Optional[Union[float, int]] = NotProvided, + num_gpus_per_learner: Optional[Union[float, int]] = NotProvided, + local_gpu_idx: Optional[int] = NotProvided, + ): + """Sets Learner worker related configuration. + + Args: + num_learners: Number of Learner workers used for updating the RLModule. + A value of 0 means training will take place on a local Learner on main + process CPUs or 1 GPU (determined by `num_gpus_per_learner`). + For multi-gpu training, you have to set `num_learners` to > 1 and set + `num_gpus_per_learner` accordingly (e.g. 4 GPUs total and model fits on + 1 GPU: `num_learners=4; num_gpus_per_learner=1` OR 4 GPUs total and + model requires 2 GPUs: `num_learners=2; num_gpus_per_learner=2`). + num_cpus_per_learner: Number of CPUs allocated per Learner worker. + Only necessary for custom processing pipeline inside each Learner + requiring multiple CPU cores. Ignored if `num_learners=0`. + num_gpus_per_learner: Number of GPUs allocated per Learner worker. If + `num_learners=0`, any value greater than 0 will run the + training on a single GPU on the main process, while a value of 0 will + run the training on main process CPUs. If `num_gpus_per_learner` is + > 0, then `num_cpus_per_learner` should not be changed (from its default + value of 1). + local_gpu_idx: If `num_gpus_per_learner` > 0, and + `num_learners` < 2, then this GPU index will be used for + training. This is an index into the available + CUDA devices. For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` + then a `local_gpu_idx` of 0 will use the GPU with ID=1 on the node. + + Returns: + This updated AlgorithmConfig object. 
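+
+        A minimal usage sketch (PPOConfig is used only for illustration; any
+        AlgorithmConfig subclass works the same way)::
+
+            from ray.rllib.algorithms.ppo import PPOConfig
+
+            config = (
+                PPOConfig()
+                # Two remote Learner workers, each reserving half a GPU.
+                .learners(num_learners=2, num_gpus_per_learner=0.5)
+            )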
+ """ + if num_learners is not NotProvided: + self.num_learners = num_learners + if num_cpus_per_learner is not NotProvided: + self.num_cpus_per_learner = num_cpus_per_learner + if num_gpus_per_learner is not NotProvided: + self.num_gpus_per_learner = num_gpus_per_learner + if local_gpu_idx is not NotProvided: + self.local_gpu_idx = local_gpu_idx + + return self + def training( self, *, @@ -1960,13 +2042,13 @@ def training( train_batch_size_per_learner: Train batch size per individual Learner worker. This setting only applies to the new API stack. The number of Learner workers can be set via `config.resources( - num_learner_workers=...)`. The total effective batch size is then - `num_learner_workers` x `train_batch_size_per_learner` and can + num_learners=...)`. The total effective batch size is then + `num_learners` x `train_batch_size_per_learner` and can be accessed via the property `AlgorithmConfig.total_train_batch_size`. train_batch_size: Training batch size, if applicable. When on the new API stack, this setting should no longer be used. Instead, use `train_batch_size_per_learner` (in combination with - `num_learner_workers`). + `num_learners`). model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. @@ -3031,7 +3113,7 @@ def is_atari(self) -> bool: @property def total_train_batch_size(self): if self.train_batch_size_per_learner is not None: - return self.train_batch_size_per_learner * (self.num_learner_workers or 1) + return self.train_batch_size_per_learner * (self.num_learners or 1) else: return self.train_batch_size @@ -3443,7 +3525,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: ) raise ValueError( "Your desired `total_train_batch_size` " - f"({self.total_train_batch_size}={self.num_learner_workers} " + f"({self.total_train_batch_size}={self.num_learners} " f"learners x {self.train_batch_size_per_learner}) " "or a value 10% off of that cannot be achieved with your other " f"settings (num_env_runners={self.num_env_runners}; " @@ -3916,23 +3998,23 @@ def _validate_resources_settings(self): # Remove this once we are able to specify placement group bundle index in RLlib if ( self.num_cpus_per_learner_worker > 1 - and self.num_gpus_per_learner_worker > 0 + and self.num_gpus_per_learner > 0 ): raise ValueError( "Cannot set both `num_cpus_per_learner_worker` > 1 and " - " `num_gpus_per_learner_worker` > 0! Either set " - "`num_cpus_per_learner_worker` > 1 (and `num_gpus_per_learner_worker`" - "=0) OR set `num_gpus_per_learner_worker` > 0 (and leave " + " `num_gpus_per_learner` > 0! Either set " + "`num_cpus_per_learner_worker` > 1 (and `num_gpus_per_learner`" + "=0) OR set `num_gpus_per_learner` > 0 (and leave " "`num_cpus_per_learner_worker` at its default value of 1). " "This is due to issues with placement group fragmentation. See " "https://github.com/ray-project/ray/issues/35409 for more details." ) # Make sure the resource requirements for learner_group is valid. - if self.num_learner_workers == 0 and self.num_gpus_per_worker > 1: + if self.num_learners == 0 and self.num_gpus_per_env_runner > 1: raise ValueError( "num_gpus_per_worker must be 0 (cpu) or 1 (gpu) when using local mode " - "(i.e. num_learner_workers = 0)" + "(i.e. 
num_learners = 0)" ) def _validate_multi_agent_settings(self): @@ -4637,6 +4719,91 @@ def validate_workers_after_construction(self, value): ) self.validate_env_runners_after_construction = value + # Cleanups from `resources()`. + @property + @Deprecated(new="AlgorithmConfig.num_cpus_per_env_runner", error=False) + def num_cpus_per_worker(self): + return self.num_cpus_per_env_runner + + @num_cpus_per_worker.setter + def num_cpus_per_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.num_cpus_per_worker", + new="AlgorithmConfig.num_cpus_per_env_runner", + error=False, + ) + self.num_cpus_per_env_runner = value + + @property + @Deprecated(new="AlgorithmConfig.num_gpus_per_env_runner", error=False) + def num_gpus_per_worker(self): + return self.num_gpus_per_env_runner + + @num_gpus_per_worker.setter + def num_gpus_per_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.num_gpus_per_worker", + new="AlgorithmConfig.num_gpus_per_env_runner", + error=False, + ) + self.num_gpus_per_env_runner = value + + @property + @Deprecated(new="AlgorithmConfig.custom_resources_per_env_runner", error=False) + def custom_resources_per_worker(self): + return self.custom_resources_per_env_runner + + @custom_resources_per_worker.setter + def custom_resources_per_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.custom_resources_per_worker", + new="AlgorithmConfig.custom_resources_per_env_runner", + error=False, + ) + self.custom_resources_per_env_runner = value + + @property + @Deprecated(new="AlgorithmConfig.num_learners", error=False) + def num_learner_workers(self): + return self.num_learners + + @num_learner_workers.setter + def num_learner_workers(self, value): + deprecation_warning( + old="AlgorithmConfig.num_learner_workers", + new="AlgorithmConfig.num_learners", + error=False, + ) + self.num_learners = value + + @property + @Deprecated(new="AlgorithmConfig.num_cpus_per_learner", error=False) + def num_cpus_per_learner_worker(self): + return self.num_cpus_per_learner + + @num_cpus_per_learner_worker.setter + def num_cpus_per_learner_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.num_cpus_per_learner_worker", + new="AlgorithmConfig.num_cpus_per_learner", + error=False, + ) + self.num_cpus_per_learner = value + + @property + @Deprecated(new="AlgorithmConfig.num_gpus_per_learner", error=False) + def num_gpus_per_learner_worker(self): + return self.num_gpus_per_learner + + @num_gpus_per_learner_worker.setter + def num_gpus_per_learner_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.num_gpus_per_learner_worker", + new="AlgorithmConfig.num_gpus_per_learner", + error=False, + ) + self.num_gpus_per_learner = value + class TorchCompileWhatToCompile(str, Enum): """Enumerates schemes of what parts of the TorchLearner can be compiled. 
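
A minimal sketch of how the new `learners()` API introduced above is meant to
be used, and how the deprecated attribute names it replaces resolve through the
shim properties defined in this patch (PPOConfig is used purely for
illustration):

    from ray.rllib.algorithms.ppo import PPOConfig

    config = PPOConfig()
    # New API: configure Learner scaling directly.
    config.learners(num_learners=2, num_gpus_per_learner=0.5)
    # The old attribute names still resolve (emitting a deprecation warning)
    # and point at the new settings.
    assert config.num_learner_workers == config.num_learners == 2
    assert config.num_gpus_per_learner_worker == config.num_gpus_per_learner
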
From 02d3d04cb323d2bbeb4edbf88db7def011351027 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 04:34:20 +0200 Subject: [PATCH 03/23] wip Signed-off-by: sven1977 --- rllib/examples/gpus/fractional_gpus.py | 136 ------------------ .../gpus/fractional_gpus_per_learner.py | 96 +++++++++++++ 2 files changed, 96 insertions(+), 136 deletions(-) delete mode 100644 rllib/examples/gpus/fractional_gpus.py create mode 100644 rllib/examples/gpus/fractional_gpus_per_learner.py diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py deleted file mode 100644 index 3895b4772e03..000000000000 --- a/rllib/examples/gpus/fractional_gpus.py +++ /dev/null @@ -1,136 +0,0 @@ -# TODO (sven): Move this example script into the new API stack. - -"""Example of a custom gym environment and model. Run this for a demo. - -This example shows: - - using a custom environment - - using a custom model - - using Tune for grid search - -You can visualize experiment results in ~/ray_results using TensorBoard. -""" -import argparse - -import ray -from ray import air, tune -from ray.air.constants import TRAINING_ITERATION -from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import check_learning_achieved -from ray.tune.registry import get_trainable_cls - -tf1, tf, tfv = try_import_tf() -torch, nn = try_import_torch() - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." -) -parser.add_argument( - "--framework", - choices=["tf", "tf2", "torch"], - default="torch", - help="The DL framework specifier.", -) -parser.add_argument("--num-gpus", type=float, default=0.5) -parser.add_argument("--num-workers", type=int, default=1) -parser.add_argument("--num-gpus-per-worker", type=float, default=0.0) -parser.add_argument("--num-envs-per-worker", type=int, default=1) -parser.add_argument( - "--as-test", - action="store_true", - help="Whether this script should be run as a test: --stop-reward must " - "be achieved within --stop-timesteps AND --stop-iters.", -) -parser.add_argument( - "--stop-iters", type=int, default=50, help="Number of iterations to train." -) -parser.add_argument( - "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." -) -parser.add_argument( - "--stop-reward", type=float, default=180.0, help="Reward at which we stop training." -) - -if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=4) - - # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), - # where ray was started using only one of these GPUs: - # $ ray start --num-gpus=1 --head - - # Note: A strange error could occur when using tf: - # "NotImplementedError: Cannot convert a symbolic Tensor - # (default_policy/cond/strided_slice:0) to a numpy array." - # In rllib/utils/exploration/random.py. - # Fix: Install numpy version 1.19.5. - - # Tested arg combinations (4 tune trials will be setup; see - # tune.grid_search over 4 learning rates below): - # - num_gpus=0.5 (2 tune trials should run in parallel). - # - num_gpus=0.3 (3 tune trials should run in parallel). 
- # - num_gpus=0.25 (4 tune trials should run in parallel) - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 - # -> 3 tune trials should run in parallel. - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 - # -> 2 tune trials should run in parallel. - # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 - # -> 1 tune trial should run in parallel. - - config = ( - get_trainable_cls(args.run) - .get_default_config() - # Setup the test env as one that requires a GPU, iff - # num_gpus_per_worker > 0. - .environment( - GPURequiringEnv if args.num_gpus_per_worker > 0.0 else "CartPole-v1" - ) - .framework(args.framework) - .resources( - # How many GPUs does the local worker (driver) need? For most algos, - # this is where the learning updates happen. - # Set this to > 1 for multi-GPU learning. - num_gpus=args.num_gpus, - # How many GPUs does each RolloutWorker (`num_workers`) need? - num_gpus_per_worker=args.num_gpus_per_worker, - ) - # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_env_runner`)? - .env_runners( - num_env_runners=args.num_workers, - # This setting should not really matter as it does not affect the - # number of GPUs reserved for each worker. - num_envs_per_env_runner=args.num_envs_per_worker, - ) - # 4 tune trials altogether. - .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) - ) - - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - # Note: The above GPU settings should also work in case you are not - # running via ``Tuner.fit()``, but instead do: - - # >> from ray.rllib.algorithms.ppo import PPO - # >> algo = PPO(config=config) - # >> for _ in range(10): - # >> results = algo.train() - # >> print(results) - - results = tune.Tuner( - args.run, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) - ).fit() - - if args.as_test: - check_learning_achieved(results, args.stop_reward) - ray.shutdown() diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py new file mode 100644 index 000000000000..5aa1eb8e1f81 --- /dev/null +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -0,0 +1,96 @@ +"""Example of using fractional GPUs (< 1.0) per Learner worker. + +This example: + - shows how to setup an Algorithm that uses one or more Learner workers ... + - ... and assigns a fractional (< 1.0) number of GPUs to each of these Learners. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-learner-workers= +[number of Learner workers, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + +You can visualize experiment results in ~/ray_results using TensorBoard. 
+""" +from ray import tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + +parser = add_rllib_example_script_args( + default_iters=50, default_reward=180, default_timesteps=100000 +) +parser.add_argument("--num-learners", type=int, default=1) +parser.add_argument("--num-gpus-per-learner", type=float, default=0.5) + + +if __name__ == "__main__": + args = parser.parse_args() + + # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), + # where ray was started using only one of these GPUs: + # $ ray start --num-gpus=1 --head + + # Tested arg combinations (4 tune trials will be setup; see + # tune.grid_search over 4 learning rates below): + # - num_gpus=0.5 (2 tune trials should run in parallel). + # - num_gpus=0.3 (3 tune trials should run in parallel). + # - num_gpus=0.25 (4 tune trials should run in parallel) + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 + # -> 3 tune trials should run in parallel. + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 + # -> 2 tune trials should run in parallel. + # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 + # -> 1 tune trial should run in parallel. + + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment("CartPole-v1") + .learners(num_learners=args.num_learners) + .resources( + num_learner_workers=args.num_learners, + # How many GPUs does the local worker (driver) need? For most algos, + # this is where the learning updates happen. + # Set this to > 1 for multi-GPU learning. + num_gpus=args.num_gpus, + # How many GPUs does each RolloutWorker (`num_workers`) need? + num_gpus_per_worker=args.num_gpus_per_worker, + ) + # 4 tune trials altogether. + .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Note: The above GPU settings should also work in case you are not + # running via ``Tuner.fit()``, but instead do: + + # >> from ray.rllib.algorithms.ppo import PPO + # >> algo = PPO(config=config) + # >> for _ in range(10): + # >> results = algo.train() + # >> print(results) + + run_rllib_example_script_experiment(base_config, args) From 3ada50af6f5c902668f0cdb708d9a542dce0b591 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 04:35:13 +0200 Subject: [PATCH 04/23] wip Signed-off-by: sven1977 --- rllib/examples/gpus/fractional_gpus.py | 136 ++++++++++++++++++ .../gpus/fractional_gpus_per_learner.py | 96 ------------- 2 files changed, 136 insertions(+), 96 deletions(-) create mode 100644 rllib/examples/gpus/fractional_gpus.py delete mode 100644 rllib/examples/gpus/fractional_gpus_per_learner.py diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py new file mode 100644 index 000000000000..3895b4772e03 --- /dev/null +++ b/rllib/examples/gpus/fractional_gpus.py @@ -0,0 +1,136 @@ +# TODO (sven): Move this example script into the new API stack. + +"""Example of a custom gym environment and model. Run this for a demo. 
+ +This example shows: + - using a custom environment + - using a custom model + - using Tune for grid search + +You can visualize experiment results in ~/ray_results using TensorBoard. +""" +import argparse + +import ray +from ray import air, tune +from ray.air.constants import TRAINING_ITERATION +from ray.rllib.examples.envs.classes.gpu_requiring_env import GPURequiringEnv +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import check_learning_achieved +from ray.tune.registry import get_trainable_cls + +tf1, tf, tfv = try_import_tf() +torch, nn = try_import_torch() + +parser = argparse.ArgumentParser() +parser.add_argument( + "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use." +) +parser.add_argument( + "--framework", + choices=["tf", "tf2", "torch"], + default="torch", + help="The DL framework specifier.", +) +parser.add_argument("--num-gpus", type=float, default=0.5) +parser.add_argument("--num-workers", type=int, default=1) +parser.add_argument("--num-gpus-per-worker", type=float, default=0.0) +parser.add_argument("--num-envs-per-worker", type=int, default=1) +parser.add_argument( + "--as-test", + action="store_true", + help="Whether this script should be run as a test: --stop-reward must " + "be achieved within --stop-timesteps AND --stop-iters.", +) +parser.add_argument( + "--stop-iters", type=int, default=50, help="Number of iterations to train." +) +parser.add_argument( + "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train." +) +parser.add_argument( + "--stop-reward", type=float, default=180.0, help="Reward at which we stop training." +) + +if __name__ == "__main__": + args = parser.parse_args() + ray.init(num_cpus=4) + + # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), + # where ray was started using only one of these GPUs: + # $ ray start --num-gpus=1 --head + + # Note: A strange error could occur when using tf: + # "NotImplementedError: Cannot convert a symbolic Tensor + # (default_policy/cond/strided_slice:0) to a numpy array." + # In rllib/utils/exploration/random.py. + # Fix: Install numpy version 1.19.5. + + # Tested arg combinations (4 tune trials will be setup; see + # tune.grid_search over 4 learning rates below): + # - num_gpus=0.5 (2 tune trials should run in parallel). + # - num_gpus=0.3 (3 tune trials should run in parallel). + # - num_gpus=0.25 (4 tune trials should run in parallel) + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 + # -> 3 tune trials should run in parallel. + # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 + # -> 2 tune trials should run in parallel. + # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 + # -> 1 tune trial should run in parallel. + + config = ( + get_trainable_cls(args.run) + .get_default_config() + # Setup the test env as one that requires a GPU, iff + # num_gpus_per_worker > 0. + .environment( + GPURequiringEnv if args.num_gpus_per_worker > 0.0 else "CartPole-v1" + ) + .framework(args.framework) + .resources( + # How many GPUs does the local worker (driver) need? For most algos, + # this is where the learning updates happen. + # Set this to > 1 for multi-GPU learning. + num_gpus=args.num_gpus, + # How many GPUs does each RolloutWorker (`num_workers`) need? 
+ num_gpus_per_worker=args.num_gpus_per_worker, + ) + # How many RolloutWorkers (each with n environment copies: + # `num_envs_per_env_runner`)? + .env_runners( + num_env_runners=args.num_workers, + # This setting should not really matter as it does not affect the + # number of GPUs reserved for each worker. + num_envs_per_env_runner=args.num_envs_per_worker, + ) + # 4 tune trials altogether. + .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) + ) + + stop = { + TRAINING_ITERATION: args.stop_iters, + NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, + } + + # Note: The above GPU settings should also work in case you are not + # running via ``Tuner.fit()``, but instead do: + + # >> from ray.rllib.algorithms.ppo import PPO + # >> algo = PPO(config=config) + # >> for _ in range(10): + # >> results = algo.train() + # >> print(results) + + results = tune.Tuner( + args.run, param_space=config.to_dict(), run_config=air.RunConfig(stop=stop) + ).fit() + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py deleted file mode 100644 index 5aa1eb8e1f81..000000000000 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Example of using fractional GPUs (< 1.0) per Learner worker. - -This example: - - shows how to setup an Algorithm that uses one or more Learner workers ... - - ... and assigns a fractional (< 1.0) number of GPUs to each of these Learners. - - -How to run this script ----------------------- -`python [script file name].py --enable-new-api-stack --num-learner-workers= -[number of Learner workers, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` - -For debugging, use the following additional command line options -`--no-tune --num-env-runners=0` -which should allow you to set breakpoints anywhere in the RLlib code and -have the execution stop there for inspection and debugging. - -For logging to your WandB account, use: -`--wandb-key=[your WandB API key] --wandb-project=[some project name] ---wandb-run-name=[optional: WandB run name (within the defined project)]` - -You can visualize experiment results in ~/ray_results using TensorBoard. -""" -from ray import tune -from ray.air.constants import TRAINING_ITERATION -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - -parser = add_rllib_example_script_args( - default_iters=50, default_reward=180, default_timesteps=100000 -) -parser.add_argument("--num-learners", type=int, default=1) -parser.add_argument("--num-gpus-per-learner", type=float, default=0.5) - - -if __name__ == "__main__": - args = parser.parse_args() - - # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), - # where ray was started using only one of these GPUs: - # $ ray start --num-gpus=1 --head - - # Tested arg combinations (4 tune trials will be setup; see - # tune.grid_search over 4 learning rates below): - # - num_gpus=0.5 (2 tune trials should run in parallel). - # - num_gpus=0.3 (3 tune trials should run in parallel). 
- # - num_gpus=0.25 (4 tune trials should run in parallel) - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 - # -> 3 tune trials should run in parallel. - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 - # -> 2 tune trials should run in parallel. - # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 - # -> 1 tune trial should run in parallel. - - base_config = ( - get_trainable_cls(args.algo) - .get_default_config() - .environment("CartPole-v1") - .learners(num_learners=args.num_learners) - .resources( - num_learner_workers=args.num_learners, - # How many GPUs does the local worker (driver) need? For most algos, - # this is where the learning updates happen. - # Set this to > 1 for multi-GPU learning. - num_gpus=args.num_gpus, - # How many GPUs does each RolloutWorker (`num_workers`) need? - num_gpus_per_worker=args.num_gpus_per_worker, - ) - # 4 tune trials altogether. - .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) - ) - - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - # Note: The above GPU settings should also work in case you are not - # running via ``Tuner.fit()``, but instead do: - - # >> from ray.rllib.algorithms.ppo import PPO - # >> algo = PPO(config=config) - # >> for _ in range(10): - # >> results = algo.train() - # >> print(results) - - run_rllib_example_script_experiment(base_config, args) From 496c5ee2deaa5be60496cae3f31d8a637f4423e3 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 05:05:59 +0200 Subject: [PATCH 05/23] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 53 +++++++++++++++---- .../backward_compat/test_backward_compat.py | 11 +++- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 8ac662b1171a..09e245589f5a 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -268,7 +268,7 @@ def __init__(self, algo_class: Optional[type] = None): self.placement_strategy = "PACK" self.num_gpus = 0 # @OldAPIStack self._fake_gpus = False # @OldAPIStack - self.num_cpus_for_local_worker = 1 + self.num_cpus_for_main_process = 1 # `self.framework()` self.framework_str = "torch" @@ -568,7 +568,7 @@ def to_dict(self) -> AlgorithmConfigDict: Returns: A complete AlgorithmConfigDict, usable in backward-compatible Tune/RLlib - use cases, e.g. w/ `tune.Tuner().fit()`. + use cases. 
""" config = copy.deepcopy(vars(self)) config.pop("algo_class") @@ -606,10 +606,17 @@ def to_dict(self) -> AlgorithmConfigDict: config["create_env_on_driver"] = config.pop("create_env_on_local_worker", 1) config["custom_eval_function"] = config.pop("custom_evaluation_function", None) config["framework"] = config.pop("framework_str", None) - config["num_cpus_for_driver"] = config.pop("num_cpus_for_local_worker", 1) + config["num_cpus_for_driver"] = config.pop( + "num_cpus_for_local_worker", config.pop("num_cpus_for_main_process", 1) + ) config["num_workers"] = config.pop( "num_env_runners", config.pop("num_rollout_workers", 0) ) + config["num_cpus_per_worker"] = config.pop("num_cpus_per_env_runner", 1) + config["num_gpus_per_worker"] = config.pop("num_gpus_per_env_runner", 0) + config["num_learner_workers"] = config.pop("num_learners", 0) + config["num_cpus_per_learner_worker"] = config.pop("num_cpus_per_learner", 1) + config["num_gpus_per_learner_worker"] = config.pop("num_gpus_per_learner", 0) # Simplify: Remove all deprecated keys that have as value `DEPRECATED_VALUE`. # These would be useless in the returned dict anyways. @@ -1209,9 +1216,9 @@ def python_environment( def resources( self, *, + num_cpus_for_main_process: Optional[int] = NotProvided, num_gpus: Optional[Union[float, int]] = NotProvided, # @OldAPIStack _fake_gpus: Optional[bool] = NotProvided, # @OldAPIStack - num_cpus_for_local_worker: Optional[int] = NotProvided, placement_strategy: Optional[str] = NotProvided, # Deprecated args. num_cpus_per_worker=DEPRECATED_VALUE, # moved to `env_runners` @@ -1220,11 +1227,16 @@ def resources( num_learner_workers=DEPRECATED_VALUE, # moved to `learners` num_cpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` num_gpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` - local_gpu_idx=DEPRECATED_VALUE # moved to `learners` + local_gpu_idx=DEPRECATED_VALUE, # moved to `learners` + num_cpus_for_local_worker = DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Specifies resources allocated for an Algorithm and its ray actors/workers. Args: + num_cpus_for_main_process: Number of CPUs to allocate for the main algorithm + process that runs `Algorithm.training_step()`. + Note: This is only relevant when running RLlib through Tune. Otherwise, + `Algorithm.training_step()` runs in the main program (driver). num_gpus: Number of GPUs to allocate to the algorithm process. Note that not all algorithms can take advantage of GPUs. Support for multi-GPU is currently only available for @@ -1233,9 +1245,6 @@ def resources( CPU machine. GPU towers will be simulated by graphs located on CPUs in this case. Use `num_gpus` to test for different numbers of fake GPUs. - num_cpus_for_local_worker: Number of CPUs to allocate for the algorithm. - Note: this only takes effect when running in Tune. Otherwise, - the algorithm runs in the main program (driver). placement_strategy: The strategy for the placement group factory returned by `Algorithm.default_resource_request()`. A PlacementGroup defines, which devices (resources) should always be co-located on the same node. 
@@ -1311,12 +1320,20 @@ def resources( ) self.local_gpu_idx = local_gpu_idx + if num_cpus_for_local_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.resources(num_cpus_for_local_worker)", + new="AlgorithmConfig.resources(num_cpus_for_main_process)", + error=False, + ) + self.num_cpus_for_main_process = num_cpus_for_local_worker + + if num_cpus_for_main_process is not NotProvided: + self.num_cpus_for_main_process = num_cpus_for_main_process if num_gpus is not NotProvided: self.num_gpus = num_gpus if _fake_gpus is not NotProvided: self._fake_gpus = _fake_gpus - if num_cpus_for_local_worker is not NotProvided: - self.num_cpus_for_local_worker = num_cpus_for_local_worker if placement_strategy is not NotProvided: self.placement_strategy = placement_strategy @@ -1944,7 +1961,7 @@ def learners( num_gpus_per_learner: Optional[Union[float, int]] = NotProvided, local_gpu_idx: Optional[int] = NotProvided, ): - """Sets Learner worker related configuration. + """Sets LearnerGroup and Learner worker related configurations. Args: num_learners: Number of Learner workers used for updating the RLModule. @@ -4804,6 +4821,20 @@ def num_gpus_per_learner_worker(self, value): ) self.num_gpus_per_learner = value + @property + @Deprecated(new="AlgorithmConfig.num_cpus_for_local_worker", error=False) + def num_cpus_for_local_worker(self): + return self.num_cpus_for_main_process + + @num_cpus_for_local_worker.setter + def num_cpus_for_local_worker(self, value): + deprecation_warning( + old="AlgorithmConfig.num_cpus_for_local_worker", + new="AlgorithmConfig.num_cpus_for_main_process", + error=False, + ) + self.num_cpus_for_main_process = value + class TorchCompileWhatToCompile(str, Enum): """Enumerates schemes of what parts of the TorchLearner can be compiled. diff --git a/rllib/tests/backward_compat/test_backward_compat.py b/rllib/tests/backward_compat/test_backward_compat.py index 5386aaf925b2..072685eff503 100644 --- a/rllib/tests/backward_compat/test_backward_compat.py +++ b/rllib/tests/backward_compat/test_backward_compat.py @@ -95,6 +95,11 @@ def test_old_algorithm_config_dicts(self): # Test, whether both keys (that map to the same new key) still work. "num_workers": 2, "num_rollout_workers": 2, + # Resource settings. 
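+            # The old-API-stack keys below are expected to be translated by
+            # `AlgorithmConfig.from_dict()` into the new
+            # `num_cpus_for_main_process`, `num_cpus_per_learner`,
+            # `num_gpus_per_learner`, and `num_learners` settings (asserted
+            # further down).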
+ "num_cpus_for_local_worker": 2, + "num_cpus_per_learner_worker": 3, + "num_gpus_per_learner_worker": 4, + "num_learner_workers": 5, } config = AlgorithmConfig.from_dict(config_dict) self.assertFalse(config.in_evaluation) @@ -105,6 +110,10 @@ def test_old_algorithm_config_dicts(self): self.assertTrue(eval_config.in_evaluation) self.assertTrue(eval_config.lr == 0.1) self.assertTrue(config.num_env_runners == 2) + self.assertTrue(config.num_cpus_for_main_process == 2) + self.assertTrue(config.num_cpus_per_learner == 3) + self.assertTrue(config.num_gpus_per_learner == 4) + self.assertTrue(config.num_learners == 5) register_env( "test", @@ -121,7 +130,7 @@ def test_old_algorithm_config_dicts(self): "num_envs_per_worker": 4, # old key -> num_envs_per_env_runner "explore": False, }, - "evaluation_num_env_runners": 1, + "evaluation_num_workers": 1, # old key -> evaluation_num_env_runners "multiagent": { "policies": { "policy1": PolicySpec(), From 68aa7bafaa9f530d0ee9ce85967464e0435f5e6f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 06:01:27 +0200 Subject: [PATCH 06/23] wip Signed-off-by: sven1977 --- rllib/BUILD | 11 ++-- .../gpus/fractional_gpus_per_learner.py | 57 ++++++++++--------- rllib/utils/test_utils.py | 49 ++++++++++------ 3 files changed, 66 insertions(+), 51 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 7383987f24f7..3e628610d6ba 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2440,14 +2440,13 @@ py_test( # subdirectory: gpus/ # .................................... -#@OldAPIStack py_test( - name = "examples/gpus/fractional_gpus", - main = "examples/gpus/fractional_gpus.py", - tags = ["team:rllib", "exclusive", "examples"], + name = "examples/gpus/fractional_gpus_per_learner", + main = "examples/gpus/fractional_gpus_per_learner.py", + tags = ["team:rllib", "exclusive", "examples", "gpu"], size = "medium", - srcs = ["examples/gpus/fractional_gpus.py"], - args = ["--as-test", "--stop-reward=40.0", "--num-gpus=0", "--num-workers=0"] + srcs = ["examples/gpus/fractional_gpus_per_learner.py"], + args = ["--enable-new-api-stack", --as-test", "--stop-reward=40.0", "--num-gpus-per-learner=0.5", "--num-learners=2"] ) # subdirectory: hierarchical/ diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py index 5aa1eb8e1f81..b8a935b29c40 100644 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -1,8 +1,8 @@ """Example of using fractional GPUs (< 1.0) per Learner worker. This example: - - shows how to setup an Algorithm that uses one or more Learner workers ... - - ... and assigns a fractional (< 1.0) number of GPUs to each of these Learners. + - shows how to set up an Algorithm that uses one or more Learner workers ... + - ... and how to assign a fractional (< 1.0) number of GPUs to each of these Learners. How to run this script @@ -15,11 +15,17 @@ which should allow you to set breakpoints anywhere in the RLlib code and have the execution stop there for inspection and debugging. +Note that the shown GPU settings in this script also work in case you are not +running via tune, but instead are using the `--no-tune` command line option. + For logging to your WandB account, use: `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` You can visualize experiment results in ~/ray_results using TensorBoard. 
+ + + """ from ray import tune from ray.air.constants import TRAINING_ITERATION @@ -37,6 +43,7 @@ parser = add_rllib_example_script_args( default_iters=50, default_reward=180, default_timesteps=100000 ) +# TODO (sven): Retire the currently supported --num-gpus in favor of --num-learners. parser.add_argument("--num-learners", type=int, default=1) parser.add_argument("--num-gpus-per-learner", type=float, default=0.5) @@ -44,6 +51,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), # where ray was started using only one of these GPUs: # $ ray start --num-gpus=1 --head @@ -63,34 +74,26 @@ base_config = ( get_trainable_cls(args.algo) .get_default_config() + # This script only works on the new API stack. + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) .environment("CartPole-v1") - .learners(num_learners=args.num_learners) - .resources( - num_learner_workers=args.num_learners, - # How many GPUs does the local worker (driver) need? For most algos, - # this is where the learning updates happen. - # Set this to > 1 for multi-GPU learning. - num_gpus=args.num_gpus, - # How many GPUs does each RolloutWorker (`num_workers`) need? - num_gpus_per_worker=args.num_gpus_per_worker, + # Define EnvRunner scaling. + .env_runners(num_env_runners=args.num_env_runners) + # Define Learner scaling. + .learners( + # How many Learner workers do we need? If you have more than 1 GPU, you + # should set this to the number of GPUs available. + num_learners=args.num_learners, + # How many GPUs does each Learner need? If you have more than 1 GPU or only + # one Learner, you should set this to 1, otherwise, set this to some + # fraction. + num_gpus_per_learner=args.num_gpus_per_learner, ) # 4 tune trials altogether. .training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) ) - stop = { - TRAINING_ITERATION: args.stop_iters, - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": args.stop_reward, - } - - # Note: The above GPU settings should also work in case you are not - # running via ``Tuner.fit()``, but instead do: - - # >> from ray.rllib.algorithms.ppo import PPO - # >> algo = PPO(config=config) - # >> for _ in range(10): - # >> results = algo.train() - # >> print(results) - - run_rllib_example_script_experiment(base_config, args) + run_rllib_example_script_experiment(base_config, args, keep_config=True) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 099c20a9d088..d60fb7e0a3f1 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1296,6 +1296,7 @@ def run_rllib_example_script_experiment( success_metric: Optional[Dict] = None, trainable: Optional[Type] = None, tune_callbacks: Optional[List] = None, + keep_config: bool = False, ) -> Union[ResultDict, tune.result_grid.ResultGrid]: """Given an algorithm config and some command line args, runs an experiment. @@ -1346,6 +1347,11 @@ def run_rllib_example_script_experiment( tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner. In case `args.wandb_key` is provided, will append a WandB logger to this list. + keep_config: Set this to True, if you don't want this utility to change the + given `base_config` in any way and leave it as-is. 
This is helpful
+            for example scripts that want to demonstrate how to set those settings
+            that are usually taken care of automatically in this function (e.g.
+            `num_env_runners`).
 
     Returns:
         The last ResultDict from a --no-tune run OR the tune.Tuner.fit()
@@ -1363,26 +1369,33 @@ def run_rllib_example_script_experiment(
     }
 
     # Enhance the `base_config`, based on provided `args`.
-    config = (
-        # Set the framework.
-        base_config.framework(args.framework)
-        # Enable the new API stack?
-        .api_stack(
-            enable_rl_module_and_learner=args.enable_new_api_stack,
-            enable_env_runner_and_connector_v2=args.enable_new_api_stack,
-        )
-        # Define EnvRunner/RolloutWorker scaling and behavior.
-        .env_runners(num_env_runners=args.num_env_runners)
-        # Define compute resources used.
-        .resources(
+    if keep_config:
+        config = base_config
+    else:
+        config = (
+            # Set the framework.
+            base_config.framework(args.framework)
+            # Enable the new API stack?
+            .api_stack(
+                enable_rl_module_and_learner=args.enable_new_api_stack,
+                enable_env_runner_and_connector_v2=args.enable_new_api_stack,
+            )
+            # Define EnvRunner/RolloutWorker scaling and behavior.
+            .env_runners(num_env_runners=args.num_env_runners)
+            # Define Learner scaling and behavior.
+            .learners(
+                num_learners=getattr(args, "num_learners", args.num_gpus),
+                num_gpus_per_learner=getattr(
+                    args,
+                    "num_gpus_per_learner",
+                    1 if torch.cuda.is_available() else 0,
+                ),
+            )
             # Old stack.
-            num_gpus=0 if args.enable_new_api_stack else args.num_gpus,
-            # New stack.
-            num_learner_workers=args.num_gpus,
-            num_gpus_per_learner_worker=1 if torch.cuda.is_available() else 0,
-            num_cpus_for_local_worker=1,
+            .resources(
+                num_gpus=0 if args.enable_new_api_stack else args.num_gpus,
+            )
         )
-    )
 
     # Run the experiment w/o Tune (directly operate on the RLlib Algorithm object). 
if args.no_tune: From 794b960c9630e7d1c3a50d06d1d39c30a8ed77fa Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 06:41:24 +0200 Subject: [PATCH 07/23] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/new_api_stack.py | 51 +++++++++++-------- doc/source/rllib/package_ref/utils.rst | 1 - doc/source/rllib/rllib-learner.rst | 13 +++-- doc/source/rllib/rllib-torch2x.rst | 2 +- doc/source/rllib/rllib-training.rst | 8 +-- rllib/algorithms/algorithm.py | 43 ++++++++-------- rllib/algorithms/algorithm_config.py | 27 +++++----- .../dreamerv3/tests/test_dreamerv3.py | 8 +-- rllib/algorithms/impala/impala.py | 14 ++--- rllib/algorithms/tests/test_algorithm.py | 4 +- rllib/benchmarks/torch_compile/README.md | 2 +- .../run_ppo_with_inference_bm.py | 6 +-- rllib/core/learner/learner.py | 2 +- rllib/core/learner/learner_group.py | 26 +++++----- rllib/core/learner/learner_group_config.py | 2 +- rllib/core/learner/scaling_config.py | 2 +- .../core/learner/tests/test_learner_group.py | 24 +++------ rllib/env/env_runner_group.py | 8 +-- rllib/evaluation/rollout_worker.py | 2 +- ...e_envs_with_inference_done_on_main_node.py | 2 +- .../debugging/deterministic_training.py | 14 ++--- .../envs/classes/gpu_requiring_env.py | 2 +- rllib/examples/learners/ppo_tuner.py | 16 +++--- .../self_play_league_based_with_open_spiel.py | 8 +-- .../multi_agent/self_play_with_open_spiel.py | 8 +-- rllib/policy/policy.py | 4 +- .../tests/test_algorithm_rl_module_restore.py | 12 ++--- rllib/tests/test_custom_resource.py | 7 ++- rllib/tests/test_gpus.py | 10 ++-- rllib/tests/test_placement_groups.py | 3 +- ...benchmark_dqn_atari_rllib_preprocessing.py | 6 +-- rllib/tuned_examples/dreamerv3/atari_100k.py | 8 +-- rllib/tuned_examples/dreamerv3/atari_200M.py | 9 ++-- .../dreamerv3/dm_control_suite_vision.py | 8 +-- rllib/tuned_examples/dreamerv3/flappy_bird.py | 8 +-- .../dreamerv3/gymnasium_robotics.py | 13 ++--- rllib/tuned_examples/dreamerv3/highway_env.py | 8 +-- .../ppo/benchmark_ppo_mujoco.py | 6 +-- .../ppo/benchmark_ppo_mujoco_pb2.py | 6 +-- .../sac/benchmark_sac_mujoco.py | 6 +-- .../sac/benchmark_sac_mujoco_pb2.py | 6 +-- rllib/utils/error.py | 8 +-- rllib/utils/test_utils.py | 27 ++++++---- rllib/utils/torch_utils.py | 49 +----------------- 44 files changed, 238 insertions(+), 261 deletions(-) diff --git a/doc/source/rllib/doc_code/new_api_stack.py b/doc/source/rllib/doc_code/new_api_stack.py index fbe485c5b665..e3aec916ed55 100644 --- a/doc/source/rllib/doc_code/new_api_stack.py +++ b/doc/source/rllib/doc_code/new_api_stack.py @@ -4,7 +4,8 @@ config = ( - PPOConfig().environment("CartPole-v1") + PPOConfig() + .environment("CartPole-v1") # Switch both the new API stack flags to True (both False by default). # This enables the use of # a) RLModule (replaces ModelV2) and Learner (replaces Policy) @@ -14,14 +15,16 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) + .resources( + num_cpus_for_main_process=1, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set - # `num_learner_workers` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner_worker=1`). 
- .resources( - num_learner_workers=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner_worker=0, # <- set this to 1, if you have at least 1 GPU - num_cpus_for_local_worker=1, + # `num_learners` to the number of available GPUs for multi-GPU training (and + # `num_gpus_per_learner=1`). + .learners( + num_learners=0, # <- in most cases, set this value to the number of GPUs + num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU ) # When using RLlib's default models (RLModules) AND the new EnvRunners, you should # set this flag in your model config. Having to set this, will no longer be required @@ -46,7 +49,8 @@ # A typical multi-agent setup (otherwise using the exact same parameters as before) # looks like this. config = ( - PPOConfig().environment(MultiAgentCartPole, env_config={"num_agents": 2}) + PPOConfig() + .environment(MultiAgentCartPole, env_config={"num_agents": 2}) # Switch both the new API stack flags to True (both False by default). # This enables the use of # a) RLModule (replaces ModelV2) and Learner (replaces Policy) @@ -56,14 +60,16 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) + .resources( + num_cpus_for_main_process=1, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set - # `num_learner_workers` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner_worker=1`). - .resources( - num_learner_workers=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner_worker=0, # <- set this to 1, if you have at least 1 GPU - num_cpus_for_local_worker=1, + # `num_learners` to the number of available GPUs for multi-GPU training (and + # `num_gpus_per_learner=1`). + .learners( + num_learners=0, # <- in most cases, set this value to the number of GPUs + num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU ) # When using RLlib's default models (RLModules) AND the new EnvRunners, you should # set this flag in your model config. Having to set this, will no longer be required @@ -92,7 +98,8 @@ config = ( - SACConfig().environment("Pendulum-v1") + SACConfig() + .environment("Pendulum-v1") # Switch both the new API stack flags to True (both False by default). # This enables the use of # a) RLModule (replaces ModelV2) and Learner (replaces Policy) @@ -102,14 +109,16 @@ enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True, ) + .resources( + num_cpus_for_main_process=1, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set - # `num_learner_workers` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner_worker=1`). - .resources( - num_learner_workers=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner_worker=0, # <- set this to 1, if you have at least 1 GPU - num_cpus_for_local_worker=1, + # `num_learners` to the number of available GPUs for multi-GPU training (and + # `num_gpus_per_learner=1`). + .learners( + num_learners=0, # <- in most cases, set this value to the number of GPUs + num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU ) # When using RLlib's default models (RLModules) AND the new EnvRunners, you should # set this flag in your model config. 
Having to set this, will no longer be required diff --git a/doc/source/rllib/package_ref/utils.rst b/doc/source/rllib/package_ref/utils.rst index 9405ff9fc276..3260fc774654 100644 --- a/doc/source/rllib/package_ref/utils.rst +++ b/doc/source/rllib/package_ref/utils.rst @@ -181,7 +181,6 @@ Torch utilities ~convert_to_torch_tensor ~explained_variance ~flatten_inputs_to_1d_tensor - ~get_device ~global_norm ~huber_loss ~l2_loss diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 218eae98bc98..b65a6f624950 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -46,7 +46,7 @@ Enabling Learner API in RLlib experiments ========================================= Adjust the amount of resources for training using the -`num_gpus_per_learner_worker`, `num_cpus_per_learner_worker`, and `num_learner_workers` +`num_gpus_per_learner`, `num_cpus_per_learner`, and `num_learners` arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig`. .. testcode:: @@ -59,11 +59,10 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf config = ( PPOConfig() .api_stack(enable_rl_module_and_learner=True) - .resources( - num_gpus_per_learner_worker=0, # Set this to 1 to enable GPU training. - num_cpus_per_learner_worker=1, - num_learner_workers=0 # Set this to greater than 1 to allow for DDP style - # updates. + .learners( + num_learners=0 # Set this to greater than 1 to allow for DDP style updates. + num_gpus_per_learner=0, # Set this to 1 to enable GPU training. + num_cpus_per_learner=1, ) ) @@ -136,7 +135,7 @@ and :py:class:`~ray.rllib.core.learner.learner.Learner` APIs via the :py:class:` # Number of Learner workers (ray actors). # Use 0 for no actors, only create a local Learner. # Use >=1 to create n DDP-style Learner workers (ray actors). - .resources(num_learner_workers=1) + .learners(num_learners=1) # Specify the learner's hyperparameters. .training( use_kl_loss=True, diff --git a/doc/source/rllib/rllib-torch2x.rst b/doc/source/rllib/rllib-torch2x.rst index c6f6e056edff..ec3e50bf934d 100644 --- a/doc/source/rllib/rllib-torch2x.rst +++ b/doc/source/rllib/rllib-torch2x.rst @@ -80,7 +80,7 @@ Some meta-level comments Exploration ------------ -In RLlib, you can now set the configuration so that it uses the compiled module during sampling of an RL agent training process. By default, the rollout workers run on CPU, therefore it is recommended to use the ``ipex`` or ``onnxrt`` backend. However, you can still run the sampling part on GPUs as well by setting ``num_gpus_per_worker`` in which case other backends can be used as well. For enabling torch-compile during training you can also set `torch_compile_learner` equivalents. +In RLlib, you can now set the configuration so that it uses the compiled module during sampling of an RL agent training process. By default, the rollout workers run on CPU, therefore it is recommended to use the ``ipex`` or ``onnxrt`` backend. However, you can still run the sampling part on GPUs as well by setting ``num_gpus_per_env_runner`` in which case other backends can be used as well. For enabling torch-compile during training you can also set `torch_compile_learner` equivalents. diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index d6a2c1e8f249..6bcf8672f276 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -444,13 +444,13 @@ and 5 remote workers (responsible for sample collection). 
Since learning is most of the time done on the local worker, it may help to provide one or more GPUs to that worker via the ``num_gpus`` setting. -Similarly, the resource allocation to remote workers can be controlled via ``num_cpus_per_worker``, ``num_gpus_per_worker``, and ``custom_resources_per_worker``. +Similarly, the resource allocation to remote workers can be controlled via ``num_cpus_per_env_runner``, ``num_gpus_per_env_runner``, and ``custom_resources_per_env_runner``. The number of GPUs can be fractional quantities (e.g. 0.5) to allocate only a fraction of a GPU. For example, with DQN you can pack five algorithms onto one GPU by setting ``num_gpus: 0.2``. Check out `this fractional GPU example here `__ as well that also demonstrates how environments (running on the remote workers) that -require a GPU can benefit from the ``num_gpus_per_worker`` setting. +require a GPU can benefit from the ``num_gpus_per_env_runner`` setting. For synchronous algorithms like PPO and A2C, the driver and workers can make use of the same GPU. To do this for an amount of ``n`` GPUS: @@ -459,7 +459,7 @@ the same GPU. To do this for an amount of ``n`` GPUS: gpu_count = n num_gpus = 0.0001 # Driver GPU - num_gpus_per_worker = (gpu_count - num_gpus) / num_env_runners + num_gpus_per_env_runner = (gpu_count - num_gpus) / num_env_runners .. Original image: https://docs.google.com/drawings/d/14QINFvx3grVyJyjAnjggOCEVN-Iq6pYVJ3jA2S6j8z0/edit?usp=sharing .. image:: images/rllib-config.svg @@ -496,7 +496,7 @@ These can be scaled by increasing ``num_env_runners`` to add rollout workers. It inference. Make sure to set ``num_gpus: 1`` if you want to use a GPU. If the learner becomes a bottleneck, multiple GPUs can be used for learning by setting ``num_gpus > 1``. -3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_worker: 1``. If you only have a single GPU, consider ``num_env_runners: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. +3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_env_runner: 1``. If you only have a single GPU, consider ``num_env_runners: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. 4. Finally, if both model and environment are compute intensive, then enable `remote worker envs `__ with `async batching `__ by setting ``remote_worker_envs: True`` and optionally ``remote_env_batch_wait_ms``. This batches inference on GPUs in the rollout workers while letting envs run asynchronously in separate actors, similar to the `SEED `__ architecture. The number of workers and number of envs per worker should be tuned to maximize GPU utilization. diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index c02ba33ed49a..67630a41e9ab 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -180,15 +180,14 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: A list of resource bundles for the learner workers. 
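+
+        For example, 2 Learner workers combined with `num_gpus_per_learner=0.5`
+        result in a single `[{"GPU": 1.0}]` bundle (2 x 0.5), following the
+        logic below.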
""" if cf.num_learner_workers > 0: - if cf.num_gpus_per_learner_worker: + if cf.num_gpus_per_learner: learner_bundles = [ - {"GPU": cf.num_learner_workers * cf.num_gpus_per_learner_worker} + {"GPU": cf.num_learner_workers * cf.num_gpus_per_learner} ] - elif cf.num_cpus_per_learner_worker: + elif cf.num_cpus_per_learner: learner_bundles = [ { - "CPU": cf.num_cpus_per_learner_worker - * cf.num_learner_workers, + "CPU": cf.num_cpus_per_learner * cf.num_learner_workers, } ] else: @@ -197,9 +196,9 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: # sampling and training is not done concurrently when local is # used, so pick the max. "CPU": max( - cf.num_cpus_per_learner_worker, cf.num_cpus_for_local_worker + cf.num_cpus_per_learner, cf.num_cpus_for_main_process ), - "GPU": cf.num_gpus_per_learner_worker, + "GPU": cf.num_gpus_per_learner, } ] return learner_bundles @@ -253,6 +252,7 @@ class Algorithm(Trainable, AlgorithmBase): "env_config", "model", "optimizer", + "custom_resources_per_env_runner", "custom_resources_per_worker", "evaluation_config", "exploration_config", @@ -2481,9 +2481,9 @@ def default_resource_request( ) -> Union[Resources, PlacementGroupFactory]: # Default logic for RLlib Algorithms: # Create one bundle per individual worker (local or remote). - # Use `num_cpus_for_local_worker` and `num_gpus` for the local worker and - # `num_cpus_per_worker` and `num_gpus_per_worker` for the remote - # workers to determine their CPU/GPU resource needs. + # Use `num_cpus_for_main_process` and `num_gpus` for the local worker and + # `num_cpus_per_env_runner` and `num_gpus_per_env_runner` for the remote + # EnvRunners to determine their CPU/GPU resource needs. # Convenience config handles. cf = cls.get_default_config().update_from_dict(config) @@ -2504,19 +2504,19 @@ def default_resource_request( else: # in this case local_worker only does sampling and training is done on # remote learner workers - driver = {"CPU": cf.num_cpus_for_local_worker, "GPU": 0} + driver = {"CPU": cf.num_cpus_for_main_process, "GPU": 0} else: driver = { - "CPU": cf.num_cpus_for_local_worker, + "CPU": cf.num_cpus_for_main_process, "GPU": 0 if cf._fake_gpus else cf.num_gpus, } # resources for remote rollout env samplers rollout_bundles = [ { - "CPU": cf.num_cpus_per_worker, - "GPU": cf.num_gpus_per_worker, - **cf.custom_resources_per_worker, + "CPU": cf.num_cpus_per_env_runner, + "GPU": cf.num_gpus_per_env_runner, + **cf.custom_resources_per_env_runner, } for _ in range(cf.num_env_runners) ] @@ -2527,9 +2527,9 @@ def default_resource_request( # Note: The local eval worker is located on the driver CPU. evaluation_bundles = [ { - "CPU": eval_cf.num_cpus_per_worker, - "GPU": eval_cf.num_gpus_per_worker, - **eval_cf.custom_resources_per_worker, + "CPU": eval_cf.num_cpus_per_env_runner, + "GPU": eval_cf.num_gpus_per_env_runner, + **eval_cf.custom_resources_per_env_runner, } for _ in range(eval_cf.evaluation_num_env_runners) ] @@ -2694,9 +2694,10 @@ def _sync_filters_if_needed( def resource_help(cls, config: Union[AlgorithmConfig, AlgorithmConfigDict]) -> str: return ( "\n\nYou can adjust the resource requests of RLlib Algorithms by calling " - "`AlgorithmConfig.resources(" - "num_gpus=.., num_cpus_per_worker=.., num_gpus_per_worker=.., ..)` or " - "`AgorithmConfig.env_runners(num_env_runners=..)`. See " + "`AlgorithmConfig.env_runners(" + "num_env_runners=.., num_cpus_per_env_runner=.., " + "num_gpus_per_env_runner=.., ..)` and " + "`AgorithmConfig.learners(num_learners=.., num_gpus_per_learner=..)`. 
See " "the `ray.rllib.algorithms.algorithm_config.AlgorithmConfig` classes " "(each Algorithm has its own subclass of this class) for more info.\n\n" f"The config of this Algorithm is: {config}" diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 09e245589f5a..726a1789d409 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1228,7 +1228,7 @@ def resources( num_cpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` num_gpus_per_learner_worker=DEPRECATED_VALUE, # moved to `learners` local_gpu_idx=DEPRECATED_VALUE, # moved to `learners` - num_cpus_for_local_worker = DEPRECATED_VALUE, + num_cpus_for_local_worker=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Specifies resources allocated for an Algorithm and its ray actors/workers. @@ -2432,7 +2432,7 @@ def offline_data( raise ValueError( msg.format( "num_cpus_per_read_task", - "config.resources(num_cpus_per_worker=..)", + "config.env_runners(num_cpus_per_env_runner=..)", ) ) if input_config.get("parallelism") is not None: @@ -4013,16 +4013,13 @@ def _validate_resources_settings(self): # TODO @Avnishn: This is a short-term work around due to # https://github.com/ray-project/ray/issues/35409 # Remove this once we are able to specify placement group bundle index in RLlib - if ( - self.num_cpus_per_learner_worker > 1 - and self.num_gpus_per_learner > 0 - ): + if self.num_cpus_per_learner > 1 and self.num_gpus_per_learner > 0: raise ValueError( - "Cannot set both `num_cpus_per_learner_worker` > 1 and " + "Cannot set both `num_cpus_per_learner` > 1 and " " `num_gpus_per_learner` > 0! Either set " - "`num_cpus_per_learner_worker` > 1 (and `num_gpus_per_learner`" + "`num_cpus_per_learner` > 1 (and `num_gpus_per_learner`" "=0) OR set `num_gpus_per_learner` > 0 (and leave " - "`num_cpus_per_learner_worker` at its default value of 1). " + "`num_cpus_per_learner` at its default value of 1). " "This is due to issues with placement group fragmentation. See " "https://github.com/ray-project/ray/issues/35409 for more details." ) @@ -4030,8 +4027,8 @@ def _validate_resources_settings(self): # Make sure the resource requirements for learner_group is valid. if self.num_learners == 0 and self.num_gpus_per_env_runner > 1: raise ValueError( - "num_gpus_per_worker must be 0 (cpu) or 1 (gpu) when using local mode " - "(i.e. num_learners = 0)" + "num_gpus_per_env_runner must be 0 (cpu) or 1 (gpu) when using local " + "mode (i.e. `num_learners=0`)" ) def _validate_multi_agent_settings(self): @@ -4138,12 +4135,12 @@ def _validate_input_settings(self): if self.input_ == "dataset": # If we need to read a ray dataset set the parallelism and # num_cpus_per_read_task from rollout worker settings - self.input_config["num_cpus_per_read_task"] = self.num_cpus_per_worker + self.input_config["num_cpus_per_read_task"] = self.num_cpus_per_env_runner if self.in_evaluation: # If using dataset for evaluation, the parallelism gets set to # evaluation_num_env_runners for backward compatibility and num_cpus - # gets set to num_cpus_per_worker from rollout worker. User only needs - # to set evaluation_num_env_runners. + # gets set to num_cpus_per_env_runner from rollout worker. User only + # needs to set evaluation_num_env_runners. 
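+            # For example, `evaluation_num_env_runners=2` results in `parallelism=2`;
+            # the default of 0 falls back to a parallelism of 1.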
self.input_config["parallelism"] = self.evaluation_num_env_runners or 1 else: # If using dataset for training, the parallelism and num_cpus gets set @@ -4439,7 +4436,7 @@ def _translate_special_keys(key: str, warn_deprecated: bool = True) -> str: elif key == "lambda": key = "lambda_" elif key == "num_cpus_for_driver": - key = "num_cpus_for_local_worker" + key = "num_cpus_for_main_process" elif key == "num_workers": key = "num_env_runners" diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py index 11a75f651de7..92bb33dda483 100644 --- a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py +++ b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py @@ -50,10 +50,10 @@ def test_dreamerv3_compilation(self): symlog_obs=True, use_float16=False, ) - .resources( - num_learner_workers=2, # Try with 2 Learners. - num_cpus_per_learner_worker=1, - num_gpus_per_learner_worker=0, + .learners( + num_learners=2, # Try with 2 Learners. + num_cpus_per_learner=1, + num_gpus_per_learner=0, ) ) diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 8d8e668bd8f2..684c24971148 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -792,16 +792,16 @@ def default_resource_request( # from RolloutWorkers (n rollout workers map to m # aggregation workers, where m < n) and always use 1 CPU # each. - "CPU": cf.num_cpus_for_local_worker + cf.num_aggregation_workers, + "CPU": cf.num_cpus_for_main_process + cf.num_aggregation_workers, "GPU": 0 if cf._fake_gpus else cf.num_gpus, } ] + [ { # RolloutWorkers. - "CPU": cf.num_cpus_per_worker, - "GPU": cf.num_gpus_per_worker, - **cf.custom_resources_per_worker, + "CPU": cf.num_cpus_per_env_runner, + "GPU": cf.num_gpus_per_env_runner, + **cf.custom_resources_per_env_runner, } for _ in range(cf.num_env_runners) ] @@ -811,9 +811,9 @@ def default_resource_request( # Evaluation (remote) workers. # Note: The local eval worker is located on the driver # CPU or not even created iff >0 eval workers. - "CPU": eval_config.num_cpus_per_worker, - "GPU": eval_config.num_gpus_per_worker, - **eval_config.custom_resources_per_worker, + "CPU": eval_config.num_cpus_per_env_runner, + "GPU": eval_config.num_gpus_per_env_runner, + **eval_config.custom_resources_per_env_runner, } for _ in range(cf.evaluation_num_env_runners) ] diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index e5b853a8c974..d7ee2e570be4 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -42,7 +42,7 @@ def test_add_delete_policy(self): }, }, ) - .resources(num_cpus_per_worker=0.1) + .env_runners(num_cpus_per_env_runner=0.1) .training( train_batch_size=100, sgd_minibatch_size=50, @@ -62,7 +62,7 @@ def test_add_delete_policy(self): ) .evaluation( evaluation_num_env_runners=1, - evaluation_config=ppo.PPOConfig.overrides(num_cpus_per_worker=0.1), + evaluation_config=ppo.PPOConfig.overrides(num_cpus_per_env_runner=0.1), ) ) diff --git a/rllib/benchmarks/torch_compile/README.md b/rllib/benchmarks/torch_compile/README.md index b449d882939d..f0216790ec7c 100644 --- a/rllib/benchmarks/torch_compile/README.md +++ b/rllib/benchmarks/torch_compile/README.md @@ -42,7 +42,7 @@ For detailed benchmarks, checkout [this google doc](https://docs.google.com/spre ## Exploration -In RLlib, you can now set the configuration so that the compiled module is used during sampling of an RL agent training process. 
By default, the rollout workers run on CPU, therefore it is recommended to use the `ipex` or `onnxrt` backend. Having said that, you can still choose to run the sampling part on GPUs as well by setting `num_gpus_per_worker` in which case other backends can be used as well. +In RLlib, you can now set the configuration so that the compiled module is used during sampling of an RL agent training process. By default, the rollout workers run on CPU, therefore it is recommended to use the `ipex` or `onnxrt` backend. Having said that, you can still choose to run the sampling part on GPUs as well by setting `num_gpus_per_env_runner` in which case other backends can be used as well. ``` diff --git a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py index 4c87a32e95ae..a941f66deff1 100644 --- a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py +++ b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py @@ -64,9 +64,9 @@ def main(pargs): torch_compile_worker_dynamo_backend=pargs.backend, torch_compile_worker_dynamo_mode=pargs.mode, ) - .resources( - num_learner_workers=1, - num_gpus_per_learner_worker=0 if pargs.smoke_test else 1, + .learners( + num_learners=1, + num_gpus_per_learner=0 if pargs.smoke_test else 1, ) ) diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 5b742a5d1b60..47af584723a9 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -255,7 +255,7 @@ def __init__( update_global_seed_if_necessary(self.framework, self.config.seed) self._distributed = self.config.num_learner_workers > 1 - self._use_gpu = self.config.num_gpus_per_learner_worker > 0 + self._use_gpu = self.config.num_gpus_per_learner > 0 # If we are using gpu but we are not distributed, use this gpu for training. self._local_gpu_idx = self.config.local_gpu_idx diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index aadcb50f703c..69a58c19b9fd 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -88,8 +88,8 @@ def __init__( config: The AlgorithmConfig object to use to configure this LearnerGroup. Call the `resources(num_learner_workers=...)` method on your config to specify the number of learner workers to use. - Call the same method with arguments `num_cpus_per_learner_worker` and/or - `num_gpus_per_learner_worker` to configure the compute used by each + Call the same method with arguments `num_cpus_per_learner` and/or + `num_gpus_per_learner` to configure the compute used by each Learner worker in this LearnerGroup. Call the `training(learner_class=...)` method on your config to specify, which exact Learner class to use. @@ -140,25 +140,25 @@ def __init__( else: backend_config = _get_backend_config(learner_class) - # TODO (sven): Cannot set both `num_cpus_per_learner_worker`>1 and - # `num_gpus_per_learner_worker`>0! Users must set one or the other due + # TODO (sven): Cannot set both `num_cpus_per_learner`>1 and + # `num_gpus_per_learner`>0! Users must set one or the other due # to issues with placement group fragmentation. See # https://github.com/ray-project/ray/issues/35409 for more details. 
- num_cpus_per_worker = ( - self.config.num_cpus_per_learner_worker - if not self.config.num_gpus_per_learner_worker + num_cpus_per_learner = ( + self.config.num_cpus_per_learner + if not self.config.num_gpus_per_learner else 0 ) - num_gpus_per_worker = self.config.num_gpus_per_learner_worker - resources_per_worker = { - "CPU": num_cpus_per_worker, - "GPU": num_gpus_per_worker, + num_gpus_per_learner = self.config.num_gpus_per_learner + resources_per_learner = { + "CPU": num_cpus_per_learner, + "GPU": num_gpus_per_learner, } backend_executor = BackendExecutor( backend_config=backend_config, - num_workers=self.config.num_learner_workers, - resources_per_worker=resources_per_worker, + num_workers=self.config.num_learners, + resources_per_worker=resources_per_learner, max_retries=0, ) backend_executor.start( diff --git a/rllib/core/learner/learner_group_config.py b/rllib/core/learner/learner_group_config.py index 42bfc280e776..4d829e810e90 100644 --- a/rllib/core/learner/learner_group_config.py +++ b/rllib/core/learner/learner_group_config.py @@ -8,7 +8,7 @@ def __init_subclass__(cls, **kwargs): "`LearnerGroupConfig` has been replaced by the `AlgorithmConfig` object of " "your experiment. All information that used to be inside " "`LearnerGroupConfig` is already available inside an `AlgorithmConfig` " - "object (e.g. `num_learner_workers` or `num_gpus_per_learner_worker`). " + "object (e.g. `num_learners` or `num_gpus_per_learner`). " "You can build a LearnerGroup directly using an `AlgorithmConfig` via: " "`config.build_learner_group(env=..., spaces=..., rl_module_spec=...)`." ) diff --git a/rllib/core/learner/scaling_config.py b/rllib/core/learner/scaling_config.py index 48d94fbef7bb..b307fb6cfb13 100644 --- a/rllib/core/learner/scaling_config.py +++ b/rllib/core/learner/scaling_config.py @@ -9,7 +9,7 @@ def __init_subclass__(cls, **kwargs): "object of your experiment. All information that used to be inside " "`LearnerGroupScalingConfig` is already available inside an " "`AlgorithmConfig` object (e.g. `num_learner_workers` or " - "`num_gpus_per_learner_worker`). You can build a LearnerGroup directly " + "`num_gpus_per_learner`). 
You can build a LearnerGroup directly " "using an `AlgorithmConfig` via: `config.build_learner_group(env=..., " "spaces=..., rl_module_spec=...)`.", ) diff --git a/rllib/core/learner/tests/test_learner_group.py b/rllib/core/learner/tests/test_learner_group.py index 0bbbd0d3edc4..2538c20a0d31 100644 --- a/rllib/core/learner/tests/test_learner_group.py +++ b/rllib/core/learner/tests/test_learner_group.py @@ -27,29 +27,19 @@ REMOTE_CONFIGS = { - "remote-cpu": AlgorithmConfig.overrides(num_learner_workers=1), - "remote-gpu": AlgorithmConfig.overrides( - num_learner_workers=1, num_gpus_per_learner_worker=1 - ), - "multi-gpu-ddp": AlgorithmConfig.overrides( - num_learner_workers=2, num_gpus_per_learner_worker=1 - ), - "multi-cpu-ddp": AlgorithmConfig.overrides( - num_learner_workers=2, num_cpus_per_learner_worker=2 - ), + "remote-cpu": AlgorithmConfig.overrides(num_learners=1), + "remote-gpu": AlgorithmConfig.overrides(num_learners=1, num_gpus_per_learner=1), + "multi-gpu-ddp": AlgorithmConfig.overrides(num_learners=2, num_gpus_per_learner=1), + "multi-cpu-ddp": AlgorithmConfig.overrides(num_learners=2, num_cpus_per_learner=2), # "multi-gpu-ddp-pipeline": AlgorithmConfig.overrides( - # num_learner_workers=2, num_gpus_per_learner_worker=2 + # num_learners=2, num_gpus_per_learner=2 # ), } LOCAL_CONFIGS = { - "local-cpu": AlgorithmConfig.overrides( - num_learner_workers=0, num_gpus_per_learner_worker=0 - ), - "local-gpu": AlgorithmConfig.overrides( - num_learner_workers=0, num_gpus_per_learner_worker=1 - ), + "local-cpu": AlgorithmConfig.overrides(num_learners=0, num_gpus_per_learner=0), + "local-gpu": AlgorithmConfig.overrides(num_learners=0, num_gpus_per_learner=1), } diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 5da468342871..e7c1c9ec15af 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -127,10 +127,10 @@ def __init__( self._policy_class = default_policy_class self._remote_config = config self._remote_args = { - "num_cpus": self._remote_config.num_cpus_per_worker, - "num_gpus": self._remote_config.num_gpus_per_worker, - "resources": self._remote_config.custom_resources_per_worker, - "max_restarts": config.max_num_worker_restarts, + "num_cpus": self._remote_config.num_cpus_per_env_runner, + "num_gpus": self._remote_config.num_gpus_per_env_runner, + "resources": self._remote_config.custom_resources_per_env_runner, + "max_restarts": config.max_num_env_runner_restarts, } # Set the EnvRunner subclass to be used as "workers". Default: RolloutWorker. 
diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 9b50a4100d0c..168c45c562e6 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -488,7 +488,7 @@ def wrap(env): num_gpus = ( self.config.num_gpus if self.worker_index == 0 - else self.config.num_gpus_per_worker + else self.config.num_gpus_per_env_runner ) # This is only for the old API where local_worker was responsible for learning diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index 91cfb23e2ff3..be4137c6aa95 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -146,7 +146,7 @@ def default_resource_request( num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), # Set the number of CPUs used by the (local) worker, aka "driver" # to match the number of Ray remote envs. - num_cpus_for_local_worker=args.num_envs_per_worker + 1, + num_cpus_for_main_process=args.num_envs_per_worker + 1, ) ) diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 4bc991fcd7a3..ce42bc378235 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -25,7 +25,7 @@ parser.add_argument("--as-test", action="store_true") parser.add_argument("--stop-iters", type=int, default=2) parser.add_argument("--num-gpus", type=float, default=0) -parser.add_argument("--num-gpus-per-worker", type=float, default=0) +parser.add_argument("--num-gpus-per-env-runner", type=float, default=0) if __name__ == "__main__": args = parser.parse_args() @@ -44,14 +44,16 @@ num_env_runners=1, num_envs_per_env_runner=2, rollout_fragment_length=50, + num_gpus_per_env_runner=args.num_gpus_per_env_runner, ) + # The new Learner API. + .learners( + num_learners=int(args.num_gpus), + num_gpus_per_learner=int(args.num_gpus > 0), + ) + # Old gpu-training API. .resources( - num_gpus_per_worker=args.num_gpus_per_worker, - # Old gpu-training API num_gpus=args.num_gpus, - # The new Learner API - num_learner_workers=int(args.num_gpus), - num_gpus_per_learner_worker=int(args.num_gpus > 0), ) # Make sure every environment gets a fixed seed. .debugging(seed=args.seed) diff --git a/rllib/examples/envs/classes/gpu_requiring_env.py b/rllib/examples/envs/classes/gpu_requiring_env.py index 9e2032405fd0..42f835c3a8c3 100644 --- a/rllib/examples/envs/classes/gpu_requiring_env.py +++ b/rllib/examples/envs/classes/gpu_requiring_env.py @@ -9,7 +9,7 @@ class GPURequiringEnv(SimpleCorridor): check in its constructor via `ray.get_gpu_ids()`. If this returns an empty list, we raise an error. - To make this env work, use `num_gpus_per_worker > 0` (RolloutWorkers + To make this env work, use `num_gpus_per_env_runner > 0` (RolloutWorkers requesting this many GPUs each) and - maybe - `num_gpus > 0` in case your local worker/driver must have an env as well. However, this is only the case if `create_env_on_driver`=True (default is False). 
diff --git a/rllib/examples/learners/ppo_tuner.py b/rllib/examples/learners/ppo_tuner.py index ff5de37e9b9e..a27e292b9efa 100644 --- a/rllib/examples/learners/ppo_tuner.py +++ b/rllib/examples/learners/ppo_tuner.py @@ -5,15 +5,15 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.ppo import PPOConfig -RESOURCE_CONFIG = { - "remote-cpu": {"num_learner_workers": 1}, - "remote-gpu": {"num_learner_workers": 1, "num_gpus_per_learner_worker": 1}, +LEARNER_CONFIG = { + "remote-cpu": {"num_learners": 1}, + "remote-gpu": {"num_learners": 1, "num_gpus_per_learner": 1}, "multi-gpu-ddp": { - "num_learner_workers": 2, - "num_gpus_per_learner_worker": 1, + "num_learners": 2, + "num_gpus_per_learner": 1, }, "local-cpu": {}, - "local-gpu": {"num_gpus_per_learner_worker": 1}, + "local-gpu": {"num_gpus_per_learner": 1}, } @@ -44,10 +44,10 @@ def _parse_args(): PPOConfig() .framework(args.framework) .environment("CartPole-v1") - .resources(**RESOURCE_CONFIG[args.config]) + .learners(**LEARNER_CONFIG[args.config]) ) - print("Testing with resource config: ", RESOURCE_CONFIG[args.config]) + print("Testing with learner config: ", LEARNER_CONFIG[args.config]) print("Testing with framework: ", args.framework) print("-" * 80) tuner = tune.Tuner( diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index e489d586e4a1..65f45e73f151 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -182,10 +182,12 @@ def _get_multi_agent(): num_env_runners=args.num_env_runners, num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) + .learners( + num_learners=args.num_gpus, + num_gpus_per_learner=1 if args.num_gpus else 0, + ) .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, ) .training( num_sgd_iter=20, diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index c61b59e2ca5f..40390b54efcf 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -130,10 +130,12 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): num_env_runners=args.num_env_runners, num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) + .learners( + num_learners=args.num_gpus, + num_gpus_per_learner=1 if args.num_gpus else 0, + ) .resources( - num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if args.num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, ) .multi_agent( # Initial policy map: Random and default algo one. This will be expanded diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 4b348771ec8b..da7b795aef07 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1253,7 +1253,7 @@ def _get_num_gpus_for_policy(self) -> int: num_gpus = 0 elif worker_idx == 0: # If we are on the new RLModule/Learner stack, `num_gpus` is deprecated. - # so use `num_gpus_per_worker` for policy sampling + # so use `num_gpus_per_env_runner` for policy sampling # we need this .get() syntax here to ensure backwards compatibility. 
if self.config.get("enable_rl_module_and_learner", False): num_gpus = self.config["num_gpus_per_worker"] @@ -1261,7 +1261,7 @@ def _get_num_gpus_for_policy(self) -> int: # If head node, take num_gpus. num_gpus = self.config["num_gpus"] else: - # If worker node, take num_gpus_per_worker + # If worker node, take `num_gpus_per_env_runner`. num_gpus = self.config["num_gpus_per_worker"] if num_gpus == 0: diff --git a/rllib/tests/test_algorithm_rl_module_restore.py b/rllib/tests/test_algorithm_rl_module_restore.py index fcedba8b6fd3..8a8dbf8e8b10 100644 --- a/rllib/tests/test_algorithm_rl_module_restore.py +++ b/rllib/tests/test_algorithm_rl_module_restore.py @@ -42,8 +42,8 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): return pol_id scaling_config = { - "num_learner_workers": 0, - "num_gpus_per_learner_worker": 0, + "num_learners": 0, + "num_gpus_per_learner": 0, } policies = {f"policy_{i}" for i in range(num_agents)} @@ -52,10 +52,10 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): PPOConfig() .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) + .learners(**scaling_config) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .resources(**scaling_config) ) return config @@ -182,17 +182,17 @@ def test_e2e_load_complex_marl_module(self): def test_e2e_load_rl_module(self): """Test if we can train a PPO algorithm with a cpkt RL module e2e.""" scaling_config = { - "num_learner_workers": 0, - "num_gpus_per_learner_worker": 0, + "num_learners": 0, + "num_gpus_per_learner": 0, } config = ( PPOConfig() .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) + .learners(**scaling_config) .environment("CartPole-v1") .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) - .resources(**scaling_config) ) env = gym.make("CartPole-v1") for fw in framework_iterator(config, frameworks=["tf2", "torch"]): diff --git a/rllib/tests/test_custom_resource.py b/rllib/tests/test_custom_resource.py index fdf4bce61835..16fe37238ce6 100644 --- a/rllib/tests/test_custom_resource.py +++ b/rllib/tests/test_custom_resource.py @@ -21,8 +21,11 @@ def test_custom_resource(algorithm): .get_default_config() .environment("CartPole-v1") .framework("torch") - .env_runners(num_env_runners=1) - .resources(num_gpus=0, custom_resources_per_worker={"custom_resource": 0.01}) + .env_runners( + num_env_runners=1, + custom_resources_per_env_runner={"custom_resource": 0.01}, + ) + .resources(num_gpus=0) ) stop = {TRAINING_ITERATION: 1} diff --git a/rllib/tests/test_gpus.py b/rllib/tests/test_gpus.py index 919244f2765e..4bbd769c5b8f 100644 --- a/rllib/tests/test_gpus.py +++ b/rllib/tests/test_gpus.py @@ -24,22 +24,22 @@ def test_gpus_in_non_local_mode(self): # Expect errors when we run a config w/ num_gpus>0 w/o a GPU # and _fake_gpus=False. for num_gpus in [0, 0.1, 1, actual_gpus + 4]: - # Only allow possible num_gpus_per_worker (so test would not + # Only allow possible num_gpus_per_env_runner (so test would not # block infinitely due to a down worker). 
per_worker = ( [0] if actual_gpus == 0 or actual_gpus < num_gpus else [0, 0.5, 1] ) - for num_gpus_per_worker in per_worker: + for num_gpus_per_env_runner in per_worker: for fake_gpus in [False] + ([] if num_gpus == 0 else [True]): config.resources( num_gpus=num_gpus, - num_gpus_per_worker=num_gpus_per_worker, _fake_gpus=fake_gpus, ) + config.env_runners(num_gpus_per_env_runner=num_gpus_per_env_runner) print( f"\n------------\nnum_gpus={num_gpus} " - f"num_gpus_per_worker={num_gpus_per_worker} " + f"num_gpus_per_env_runner={num_gpus_per_env_runner} " f"_fake_gpus={fake_gpus}" ) @@ -49,7 +49,7 @@ def test_gpus_in_non_local_mode(self): for _ in framework_iterator(config, frameworks=frameworks): # Expect that Algorithm creation causes a num_gpu error. if ( - actual_gpus < num_gpus + 2 * num_gpus_per_worker + actual_gpus < num_gpus + 2 * num_gpus_per_env_runner and not fake_gpus ): # "Direct" RLlib (create Algorithm on the driver). diff --git a/rllib/tests/test_placement_groups.py b/rllib/tests/test_placement_groups.py index 44ae64f6c4c3..2e056e09d1a9 100644 --- a/rllib/tests/test_placement_groups.py +++ b/rllib/tests/test_placement_groups.py @@ -71,15 +71,16 @@ def default_resource_request(cls, config): def test_default_resource_request(self): config = ( PPOConfig() + .resources(placement_strategy="SPREAD") .env_runners( num_env_runners=2, + num_cpus_per_env_runner=2, ) .training( model={"fcnet_hiddens": [10]}, lr=tune.grid_search([0.1, 0.01, 0.001]) ) .environment("CartPole-v1") .framework("torch") - .resources(placement_strategy="SPREAD", num_cpus_per_worker=2) ) # 3 Trials: Can only run 1 at a time (num_cpus=6; needed: 5). diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 11999b9f8689..3939f9db9bed 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -305,10 +305,10 @@ def stop_all(self): rollout_fragment_length=4, num_env_runners=1, ) - .resources( + .learners( # We have a train/sample ratio of 1:1 and a batch of 32. - num_learner_workers=1, - num_gpus_per_learner_worker=1, + num_learners=1, + num_gpus_per_learner=1, ) # TODO (simon): Adjust to new model_config_dict. .training( diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py index 5f5829f39a62..23a46fcbf3e7 100644 --- a/rllib/tuned_examples/dreamerv3/atari_100k.py +++ b/rllib/tuned_examples/dreamerv3/atari_100k.py @@ -37,9 +37,11 @@ } ) .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also diff --git a/rllib/tuned_examples/dreamerv3/atari_200M.py b/rllib/tuned_examples/dreamerv3/atari_200M.py index fdbb31d94aa2..2fb1e48b0929 100644 --- a/rllib/tuned_examples/dreamerv3/atari_200M.py +++ b/rllib/tuned_examples/dreamerv3/atari_200M.py @@ -20,11 +20,14 @@ config = ( DreamerV3Config() .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, # For each (parallelized) env, we should provide a CPU. Lower this number # if you don't have enough CPUs. 
- num_cpus_for_local_worker=8 * (num_gpus or 1), + num_cpus_for_main_process=8 + * (num_gpus or 1), + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py index 4e262c9e0e5d..b201900da5f6 100644 --- a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py +++ b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py @@ -22,9 +22,11 @@ # Use image observations. .environment(env_config={"from_pixels": True}) .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) .env_runners(num_envs_per_env_runner=4 * (num_gpus or 1), remote_worker_envs=True) .reporting( diff --git a/rllib/tuned_examples/dreamerv3/flappy_bird.py b/rllib/tuned_examples/dreamerv3/flappy_bird.py index 1adbbeac44c5..31755b6dfe3c 100644 --- a/rllib/tuned_examples/dreamerv3/flappy_bird.py +++ b/rllib/tuned_examples/dreamerv3/flappy_bird.py @@ -45,9 +45,11 @@ def _env_creator(ctx): ( config.environment("flappy-bird") .resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also diff --git a/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py b/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py index 2de15805451f..14fd1f930703 100644 --- a/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py +++ b/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py @@ -39,15 +39,16 @@ # Further specify the details of our config object. ( config.resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=8 * (num_gpus or 1), + num_cpus_for_main_process=8 * (num_gpus or 1), + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. 
- .env_runners( - num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True - ).reporting( + .env_runners(num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True) + .reporting( metrics_num_episodes_for_smoothing=(num_gpus or 1), report_images_and_videos=False, report_dream_data=False, diff --git a/rllib/tuned_examples/dreamerv3/highway_env.py b/rllib/tuned_examples/dreamerv3/highway_env.py index b96562fb22ea..c3588f502c1a 100644 --- a/rllib/tuned_examples/dreamerv3/highway_env.py +++ b/rllib/tuned_examples/dreamerv3/highway_env.py @@ -38,9 +38,11 @@ ( config.resources( - num_learner_workers=0 if num_gpus == 1 else num_gpus, - num_gpus_per_learner_worker=1 if num_gpus else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, ) .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index e921d3668122..e266f1b64902 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -95,11 +95,11 @@ def stop_all(self): num_env_runners=32, rollout_fragment_length=512, ) - .resources( + .learners( # Let's start with a small number of learner workers and # add later a tune grid search for these resources. - num_learner_workers=1, - num_gpus_per_learner_worker=1, + num_learners=1, + num_gpus_per_learner=1, ) # TODO (simon): Adjust to new model_config_dict. .training( diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index 4066b879bbe0..c8259f8ea91b 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -79,13 +79,13 @@ num_env_runners=num_rollout_workers, # TODO (sven, simon): Add resources. ) - .resources( + .learners( # Let's start with a small number of learner workers and # add later a tune grid search for these resources. # TODO (simon): Either add tune grid search here or make # an extra script to only test scalability. - num_learner_workers=1, - num_gpus_per_learner_worker=1, + num_learners=1, + num_gpus_per_learner=1, ) # TODO (simon): Adjust to new model_config_dict. .training( diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index 41c2e655e6a4..339f978e9beb 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -85,11 +85,11 @@ def stop_all(self): rollout_fragment_length=1, num_env_runners=0, ) - .resources( + .learners( # Note, we have a sample/train ratio of 1:1 and a small train # batch, so 1 learner with a single GPU should suffice. - num_learner_workers=1, - num_gpus_per_learner_worker=1, + num_learners=1, + num_gpus_per_learner=1, ) # TODO (simon): Adjust to new model_config_dict. .training( diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 99f5f2874461..740fa854c539 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -71,11 +71,11 @@ num_env_runners=1, # TODO (sven, simon): Add resources. ) - .resources( + .learners( # Note, we have a small batch and a sample/train ratio # of 1:1, so a single GPU should be enough. 
- num_learner_workers=1, - num_gpus_per_learner_worker=1, + num_learners=1, + num_gpus_per_learner=1, ) # TODO (simon): Adjust to new model_config_dict. .training( diff --git a/rllib/utils/error.py b/rllib/utils/error.py index 9e5593a47601..3e2b7a8f9474 100644 --- a/rllib/utils/error.py +++ b/rllib/utils/error.py @@ -34,11 +34,11 @@ class NotSerializable(Exception): # ------- # Message explaining there are no GPUs available for the -# num_gpus=n or num_gpus_per_worker=m settings. +# num_gpus=n or num_gpus_per_env_runner=m settings. ERR_MSG_NO_GPUS = """Found {} GPUs on your machine (GPU devices found: {})! If your - machine does not have any GPUs, you should set the config keys `num_gpus` and - `num_gpus_per_worker` to 0 (they may be set to 1 by default for your - particular RL algorithm).""" + machine does not have any GPUs, you should set the config keys + `num_gpus_per_learner` and `num_gpus_per_env_runner` to 0 (they may be set to + 1 by default for your particular RL algorithm).""" ERR_MSG_INVALID_ENV_DESCRIPTOR = """The env string you provided ('{}') is: a) Not a supported/installed environment. diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 099c20a9d088..d3c12063e304 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -217,7 +217,7 @@ def add_rllib_example_script_args( # Learner scaling options. # Old API stack: config.num_gpus. - # New API stack: config.num_learner_workers (w/ num_gpus_per_learner_worker=1). + # New API stack: config.num_learner_workers (w/ num_gpus_per_learner=1). parser.add_argument("--num-gpus", type=int, default=0) # Ray init options. @@ -1374,13 +1374,15 @@ def run_rllib_example_script_experiment( # Define EnvRunner/RolloutWorker scaling and behavior. .env_runners(num_env_runners=args.num_env_runners) # Define compute resources used. + # New stack. + .learners( + num_learners=args.num_gpus, + num_gpus_per_learner=1 if torch.cuda.is_available() else 0, + ) + # Old stack. .resources( - # Old stack. num_gpus=0 if args.enable_new_api_stack else args.num_gpus, - # New stack. 
- num_learner_workers=args.num_gpus, - num_gpus_per_learner_worker=1 if torch.cuda.is_available() else 0, - num_cpus_for_local_worker=1, + num_cpus_for_main_process=1, ) ) @@ -1601,14 +1603,17 @@ def check_reproducibilty( # type in ci build has enough resources) for num_workers in [0, 2]: algo_config = ( - algo_config.debugging(seed=42) + algo_config.debugging(seed=42).env_runners( + num_env_runners=num_workers, num_envs_per_env_runner=2 + ) + # new API + .learners( + num_gpus_per_learner=int(os.environ.get("RLLIB_NUM_GPUS", "0")), + ) + # old API .resources( - # old API num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), - # new API - num_gpus_per_learner_worker=int(os.environ.get("RLLIB_NUM_GPUS", "0")), ) - .env_runners(num_env_runners=num_workers, num_envs_per_env_runner=2) ) for fw in framework_iterator(algo_config, **fw_kwargs): diff --git a/rllib/utils/torch_utils.py b/rllib/utils/torch_utils.py index 2a02dfffa768..a8e77c9af356 100644 --- a/rllib/utils/torch_utils.py +++ b/rllib/utils/torch_utils.py @@ -4,12 +4,11 @@ from typing import Dict, List, Optional, TYPE_CHECKING, Union import gymnasium as gym -import numpy as np -import tree # pip install dm_tree from gymnasium.spaces import Discrete, MultiDiscrete +import numpy as np from packaging import version +import tree # pip install dm_tree -import ray from ray.rllib.models.repeated_values import RepeatedValues from ray.rllib.utils.annotations import Deprecated, PublicAPI, DeveloperAPI from ray.rllib.utils.framework import try_import_torch @@ -442,50 +441,6 @@ def flatten_inputs_to_1d_tensor( return merged -@PublicAPI -def get_device(config): - """Returns a torch device edepending on a config and current worker index.""" - - # Figure out the number of GPUs to use on the local side (index=0) or on - # the remote workers (index > 0). - worker_idx = config.get("worker_index", 0) - if ( - not config["_fake_gpus"] - and ray._private.worker._mode() == ray._private.worker.LOCAL_MODE - ): - num_gpus = 0 - elif worker_idx == 0: - num_gpus = config["num_gpus"] - else: - num_gpus = config["num_gpus_per_worker"] - # All GPU IDs, if any. - gpu_ids = list(range(torch.cuda.device_count())) - - # Place on one or more CPU(s) when either: - # - Fake GPU mode. - # - num_gpus=0 (either set by user or we are in local_mode=True). - # - No GPUs available. - if config["_fake_gpus"] or num_gpus == 0 or not gpu_ids: - return torch.device("cpu") - # Place on one or more actual GPU(s), when: - # - num_gpus > 0 (set by user) AND - # - local_mode=False AND - # - actual GPUs available AND - # - non-fake GPU mode. - else: - # We are a remote worker (WORKER_MODE=1): - # GPUs should be assigned to us by ray. - if ray._private.worker._mode() == ray._private.worker.WORKER_MODE: - gpu_ids = ray.get_gpu_ids() - - if len(gpu_ids) < num_gpus: - raise ValueError( - "TorchPolicy was not able to find enough GPU IDs! Found " - f"{gpu_ids}, but num_gpus={num_gpus}." - ) - return torch.device("cuda") - - @PublicAPI def global_norm(tensors: List[TensorType]) -> TensorType: """Returns the global L2 norm over a list of tensors. 
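The hunks above consistently move Learner scaling out of `.resources()` and into the new `.learners()` API (`num_learner_workers` -> `num_learners`, `num_gpus_per_learner_worker` -> `num_gpus_per_learner`, `num_cpus_for_local_worker` -> `num_cpus_for_main_process`). For orientation, the following is a minimal, illustrative sketch of how the renamed settings fit together; PPO and CartPole are stand-ins only, the sketch is not part of this patch set, and enabling the new API stack itself (normally handled by the example scripts' `--enable-new-api-stack` flag) is omitted here.

.. code-block:: python

    from ray.rllib.algorithms.ppo import PPOConfig

    config = (
        PPOConfig()
        .environment("CartPole-v1")
        # EnvRunner (sampling) scaling stays under `.env_runners()`.
        .env_runners(num_env_runners=2)
        # Learner scaling now lives under `.learners()` instead of `.resources()`.
        .learners(
            num_learners=1,
            # Fractional values < 1.0 are allowed; set to 0 on CPU-only machines.
            num_gpus_per_learner=0.5,
        )
        # Only main-process resources remain under `.resources()`.
        .resources(num_cpus_for_main_process=1)
    )
    algo = config.build()  # or pass `config` to Ray Tune as before

With one Learner and `num_gpus_per_learner=0.5`, two such trials can share a single GPU, which is the setup exercised by the fractional-GPU example script and BUILD targets touched later in this series.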
From 03be7f54f14035629e1e159b45f5aea751980a02 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 06:59:35 +0200 Subject: [PATCH 08/23] wip Signed-off-by: sven1977 --- rllib/BUILD | 12 +++- .../gpus/fractional_gpus_per_learner.py | 64 ++++++++++++------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 3e628610d6ba..e579842c055a 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2441,12 +2441,20 @@ py_test( # .................................... py_test( - name = "examples/gpus/fractional_gpus_per_learner", + name = "examples/gpus/fractional_0.5_gpus_per_learner", main = "examples/gpus/fractional_gpus_per_learner.py", tags = ["team:rllib", "exclusive", "examples", "gpu"], size = "medium", srcs = ["examples/gpus/fractional_gpus_per_learner.py"], - args = ["--enable-new-api-stack", "--as-test", "--stop-reward=40.0", "--num-gpus-per-learner=0.5", "--num-learners=2"] + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=40.0", "--num-learners=1", "--num-gpus-per-learner=0.5"] ) py_test( name = "examples/gpus/fractional_0.3_gpus_per_learner", main = "examples/gpus/fractional_gpus_per_learner.py", tags = ["team:rllib", "exclusive", "examples", "gpu"], size = "medium", srcs = ["examples/gpus/fractional_gpus_per_learner.py"], args = ["--enable-new-api-stack", "--as-test", "--stop-reward=40.0", "--num-learners=1", "--num-gpus-per-learner=0.3"] ) # subdirectory: hierarchical/ diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py index b8a935b29c40..33fb5052d5c1 100644 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -10,6 +10,17 @@ `python [script file name].py --enable-new-api-stack --num-learner-workers= [number of Learner workers, e.g. 1] --num-gpus-per-learner [some fraction <1.0]` +The following command line combinations have been tested on a 4x NVIDIA T4 GPU (16 vCPUs) +machine. +Note that for each run, 4 tune trials will be set up; see tune.grid_search over 4 +learning rates in the `base_config` below: +1) --num-learners=1 --num-gpus-per-learner=0.5 (2.0 GPUs used). +2) --num-learners=1 --num-gpus-per-learner=0.3 (1.2 GPUs used). +3) --num-learners=1 --num-gpus-per-learner=0.25 (1.0 GPU used). +4) nonsensical setting: --num-learners=2 --num-gpus-per-learner=0.5 (expect an +NCCL-related error because torch tries to perform DDP sharding, +but notices that the shards sit on the same GPU). + For debugging, use the following additional command line options `--no-tune --num-env-runners=0` which should allow you to set breakpoints anywhere in the RLlib code and @@ -25,15 +36,38 @@ You can visualize experiment results in ~/ray_results using TensorBoard. - +Results to expect +----------------- +In the console output, you can see that only fractional GPUs are being used by RLlib: + +== Status == +... +Logical resource usage: 12.0/16 CPUs, 1.0/4 GPUs (...) +...
+Number of trials: 4/4 (4 RUNNING) + +The final output should look something like this: ++-----------------------------+------------+-----------------+--------+--------+ +| Trial name | status | loc | lr | iter | +| | | | | | +|-----------------------------+------------+-----------------+--------+--------+ +| PPO_CartPole-v1_7104b_00000 | TERMINATED | 10.0.0.39:31197 | 0.005 | 10 | +| PPO_CartPole-v1_7104b_00001 | TERMINATED | 10.0.0.39:31202 | 0.003 | 11 | +| PPO_CartPole-v1_7104b_00002 | TERMINATED | 10.0.0.39:31203 | 0.001 | 10 | +| PPO_CartPole-v1_7104b_00003 | TERMINATED | 10.0.0.39:31204 | 0.0001 | 11 | ++-----------------------------+------------+-----------------+--------+--------+ + ++----------------+----------------------+----------------------+----------------------+ +| total time (s) | num_env_steps_sample | num_env_steps_traine | num_episodes_lifetim | +| | d_lifetime | d_lifetime | e | +|----------------+----------------------+----------------------+----------------------| +| 101.002 | 40000 | 40000 | 346 | +| 110.03 | 44000 | 44000 | 395 | +| 101.171 | 40000 | 40000 | 328 | +| 110.091 | 44000 | 44000 | 478 | ++----------------+----------------------+----------------------+----------------------+ """ from ray import tune -from ray.air.constants import TRAINING_ITERATION -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, @@ -55,22 +89,6 @@ args.enable_new_api_stack ), "Must set --enable-new-api-stack when running this script!" - # These configs have been tested on a p2.8xlarge machine (8 GPUs, 16 CPUs), - # where ray was started using only one of these GPUs: - # $ ray start --num-gpus=1 --head - - # Tested arg combinations (4 tune trials will be setup; see - # tune.grid_search over 4 learning rates below): - # - num_gpus=0.5 (2 tune trials should run in parallel). - # - num_gpus=0.3 (3 tune trials should run in parallel). - # - num_gpus=0.25 (4 tune trials should run in parallel) - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (1 worker) -> 0.3 - # -> 3 tune trials should run in parallel. - # - num_gpus=0.2 + num_gpus_per_worker=0.1 (2 workers) -> 0.4 - # -> 2 tune trials should run in parallel. - # - num_gpus=0.4 + num_gpus_per_worker=0.1 (2 workers) -> 0.6 - # -> 1 tune trial should run in parallel. 
- base_config = ( get_trainable_cls(args.algo) .get_default_config() From e6a8a2e8cdc65bbab751431e5bfc94ef9477a485 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 11:53:15 +0200 Subject: [PATCH 09/23] wip Signed-off-by: sven1977 --- .../config/vocabularies/RLlib/accept.txt | 3 + doc/source/rllib/rllib-examples.rst | 184 +---------- rllib/examples/README.rst | 299 ++++++++++++++++++ .../attention_net_supervised.py | 1 + .../_old_api_stack/complex_struct_space.py | 1 + .../connectors/adapt_connector_policy.py | 1 + .../connectors/prepare_checkpoint.py | 1 + .../connectors/run_connector_policy.py | 1 + .../self_play_with_policy_checkpoint.py | 1 + .../_old_api_stack/custom_keras_model.py | 1 + .../models/action_mask_model.py | 1 + .../models/autoregressive_action_dist.py | 1 + .../models/autoregressive_action_model.py | 1 + .../_old_api_stack/models/batch_norm_model.py | 1 + .../models/centralized_critic_models.py | 1 + .../_old_api_stack/models/custom_model_api.py | 1 + .../_old_api_stack/models/eager_model.py | 1 + .../_old_api_stack/models/fast_model.py | 1 + .../models/mobilenet_v2_encoder.py | 1 + .../models/mobilenet_v2_with_lstm_models.py | 1 + .../examples/_old_api_stack/models/modelv3.py | 1 + .../_old_api_stack/models/neural_computer.py | 1 + .../models/parametric_actions_model.py | 1 + .../_old_api_stack/models/rnn_model.py | 1 + .../_old_api_stack/models/rnn_spy_model.py | 1 + .../models/shared_weights_model.py | 1 + .../_old_api_stack/models/simple_rpg_model.py | 1 + .../trajectory_view_utilizing_models.py | 1 + .../parametric_actions_cartpole.py | 1 + ...ons_cartpole_embeddings_learnt_by_model.py | 1 + .../policy/cliff_walking_wall_policy.py | 1 + .../policy/episode_env_aware_policy.py | 1 + .../policy/memory_leaking_policy.py | 1 + .../_old_api_stack/policy/random_policy.py | 1 + .../policy/rock_paper_scissors_dummies.py | 1 + .../remote_base_env_with_custom_api.py | 1 + ...e_envs_with_inference_done_on_main_node.py | 1 + .../_old_api_stack/sb2rllib_rllib_example.py | 1 + .../_old_api_stack/sb2rllib_sb_example.py | 1 + rllib/examples/action_masking.py | 2 + rllib/examples/algorithms/README.rst | 5 + ...raining_step_on_and_off_policy_combined.py | 2 + rllib/examples/autoregressive_action_dist.py | 2 +- rllib/examples/cartpole_lstm.py | 3 +- rllib/examples/catalogs/README.rst | 0 .../catalogs/custom_action_distribution.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- rllib/examples/centralized_critic.py | 2 +- rllib/examples/centralized_critic_2.py | 2 +- .../examples/checkpoint_by_custom_criteria.py | 6 - rllib/examples/checkpoints/README.rst | 0 .../checkpoints/cartpole_dqn_export.py | 2 +- rllib/examples/checkpoints/onnx_tf.py | 1 + rllib/examples/checkpoints/onnx_torch.py | 2 +- .../restore_1_of_n_agents_from_checkpoint.py | 2 +- ...e_adapted_gae_on_postprocess_trajectory.py | 2 + rllib/examples/connectors/README.rst | 1 + rllib/examples/curriculum/README.rst | 0 .../examples/custom_metrics_and_callbacks.py | 2 +- rllib/examples/custom_model_api.py | 1 + .../examples/custom_model_loss_and_metrics.py | 3 +- .../custom_recurrent_rnn_tokenizer.py | 2 +- rllib/examples/debugging/README.rst | 0 .../debugging/deterministic_training.py | 2 +- rllib/examples/envs/README.rst | 0 rllib/examples/envs/greyscale_env.py | 1 + rllib/examples/envs/unity3d_env_local.py | 2 +- rllib/examples/evaluation/README.rst | 0 rllib/examples/gpus/README.rst | 0 rllib/examples/hierarchical/README.rst | 0 .../hierarchical/hierarchical_training.py | 2 +- 
rllib/examples/inference/README.rst | 0 .../policy_inference_after_training.py | 1 + ...inference_after_training_with_attention.py | 1 + ...licy_inference_after_training_with_lstm.py | 1 + rllib/examples/learners/README.rst | 0 rllib/examples/multi_agent/README.rst | 0 rllib/examples/multi_agent/two_algorithms.py | 2 +- rllib/examples/offline_rl/README.rst | 0 rllib/examples/offline_rl/custom_input_api.py | 2 +- rllib/examples/offline_rl/offline_rl.py | 2 +- .../examples/offline_rl/saving_experiences.py | 2 +- rllib/examples/ray_serve/README.rst | 0 rllib/examples/ray_tune/README.rst | 0 rllib/examples/replay_buffer_api.py | 2 +- rllib/examples/rl_modules/README.rst | 0 .../examples/rl_modules/action_masking_rlm.py | 6 - .../rl_modules/episode_env_aware_rlm.py | 6 - .../examples/rl_modules/frame_stacking_rlm.py | 12 - rllib/examples/rl_modules/mobilenet_rlm.py | 6 - rllib/examples/rl_modules/random_rl_module.py | 6 - 91 files changed, 379 insertions(+), 245 deletions(-) create mode 100644 rllib/examples/README.rst create mode 100644 rllib/examples/algorithms/README.rst create mode 100644 rllib/examples/catalogs/README.rst delete mode 100644 rllib/examples/checkpoint_by_custom_criteria.py create mode 100644 rllib/examples/checkpoints/README.rst create mode 100644 rllib/examples/connectors/README.rst create mode 100644 rllib/examples/curriculum/README.rst create mode 100644 rllib/examples/debugging/README.rst create mode 100644 rllib/examples/envs/README.rst create mode 100644 rllib/examples/evaluation/README.rst create mode 100644 rllib/examples/gpus/README.rst create mode 100644 rllib/examples/hierarchical/README.rst create mode 100644 rllib/examples/inference/README.rst create mode 100644 rllib/examples/learners/README.rst create mode 100644 rllib/examples/multi_agent/README.rst create mode 100644 rllib/examples/offline_rl/README.rst create mode 100644 rllib/examples/ray_serve/README.rst create mode 100644 rllib/examples/ray_tune/README.rst create mode 100644 rllib/examples/rl_modules/README.rst delete mode 100644 rllib/examples/rl_modules/action_masking_rlm.py delete mode 100644 rllib/examples/rl_modules/episode_env_aware_rlm.py delete mode 100644 rllib/examples/rl_modules/frame_stacking_rlm.py delete mode 100644 rllib/examples/rl_modules/mobilenet_rlm.py delete mode 100644 rllib/examples/rl_modules/random_rl_module.py diff --git a/.vale/styles/config/vocabularies/RLlib/accept.txt b/.vale/styles/config/vocabularies/RLlib/accept.txt index 9ca4e388607d..54d9a0546b3b 100644 --- a/.vale/styles/config/vocabularies/RLlib/accept.txt +++ b/.vale/styles/config/vocabularies/RLlib/accept.txt @@ -9,9 +9,12 @@ config (IMPALA|impala) hyperparameters? MARLModule +MLAgents +multiagent postprocessing (PPO|ppo) [Pp]y[Tt]orch +pragmas? (RL|rl)lib RLModule rollout diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 9f3af7d3163c..4671cc601736 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -2,186 +2,4 @@ .. include:: /_includes/rllib/new_api_stack.rst -Examples -======== - -This page is an index of examples for the various use cases and features of RLlib. - -If any example is broken, or if you'd like to add an example to this page, -feel free to raise an issue on our Github repository. - -Tuned Examples --------------- - -- `Tuned examples `__: - Collection of tuned hyperparameters sorted by algorithm. 
- - -Environments and Adapters -------------------------- - -- `Registering a custom env and model `__: - Example of defining and registering a gym env and model for use with RLlib. -- `Local Unity3D multi-agent environment example `__: - Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to - learn any Unity3D game (including support for multi-agent). - Use this example to try things out and watch the game and the learning progress live in the editor. - Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. - For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. - -Custom- and Complex Models --------------------------- - -- `Custom Keras model `__: - Example of using a custom Keras model. -- `Registering a custom model with supervised loss `__: - Example of defining and registering a custom model with a supervised loss. -- `Batch normalization `__: - Example of adding batch norm layers to a custom model. -- `Custom model API example `__: - Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. -- `Trajectory View API utilizing model `__: - An example on how a model can use the trajectory view API to specify its own input. -- `MobileNetV2 wrapping example model `__: - Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. -- `Differentiable Neural Computer `__: - Example of DeepMind's Differentiable Neural Computer for partially-observable environments. - - -Training Workflows ------------------- - -- `Custom training workflows `__: - Example of how to use Tune's support for custom training functions to implement custom training workflows. -- `Curriculum learning with the TaskSettableEnv API `__: - Example of how to advance the environment through different phases (tasks) over time. - Also see the `curriculum learning how-to `__ from the documentation here. -- `Custom logger `__: - How to setup a custom Logger object in RLlib. -- `Custom metrics `__: - Example of how to output custom training metrics to TensorBoard. -- `Custom Algorith.training_step() method combining on- and off-policy learning `__: - Example of how to use the exec. plan of an Algorithm to trin two different policies in parallel (also using multi-agent API). -- `Custom tune experiment `__: - How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. - - -Evaluation: ------------ -- `Custom evaluation function `__: - Example of how to write a custom evaluation function that is called instead of the default behavior, which is running with the evaluation worker set through n episodes. -- `Parallel evaluation and training `__: - Example showing how the evaluation workers and the "normal" rollout workers can run (to some extend) in parallel to speed up training. - - -Serving and Offline -------------------- -- `Offline RL with CQL `__: - Example showing how to run an offline RL training job using a historic-data json file. -- `Another example for using RLlib with Ray Serve `__ - This script offers a simple workflow for 1) training a policy with RLlib first, 2) creating a new policy 3) restoring its weights from the trained - one and serving the new policy via Ray Serve. 
-- `Unity3D client/server `__: - Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting - clients against a central RLlib Policy server learning how to play the game. - The n distributed clients could themselves be servers for external/human players and allow for control - being fully in the hands of the Unity entities instead of RLlib. - Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. -- `CartPole client/server `__: - Example of online serving of predictions for a simple CartPole policy. -- `Saving experiences `__: - Example of how to externally generate experience batches in RLlib-compatible format. -- `Finding a checkpoint using custom criteria `__: - Example of how to find a :ref:`checkpoint ` after a `Tuner.fit()` via some custom defined criteria. - - -Multi-Agent and Hierarchical ----------------------------- - -- `Simple independent multi-agent setup vs a PettingZoo env `__: - Setup RLlib to run any algorithm in (independent) multi-agent mode against a multi-agent environment. -- `More complex (shared-parameter) multi-agent setup vs a PettingZoo env `__: - Setup RLlib to run any algorithm in (shared-parameter) multi-agent mode against a multi-agent environment. -- `Rock-paper-scissors heuristic vs learned `__ and `Rock-paper-scissors learned vs learned `__: - Two examples of different heuristic and learned policies competing against each other in the rock-paper-scissors environment. -- `Two-step game `__: - Example on how to use agent grouping in a multi-agent environment (the two-step game from the `QMIX paper `__). -- `PettingZoo multi-agent example `__: - Example on how to use RLlib to learn in `PettingZoo `__ multi-agent environments. -- `PPO with centralized critic on two-step game `__: - Example of customizing PPO to leverage a centralized value function. -- `Centralized critic in the env `__: - A simpler method of implementing a centralized critic by augmentating agent observations with global information. -- `Hand-coded policy `__: - Example of running a custom hand-coded policy alongside trainable policies. -- `Weight sharing between policies `__: - Example of how to define weight-sharing layers between two different policies. -- `Multiple algorithms `__: - Example of alternating training between DQN and PPO. -- `Hierarchical training `__: - Example of hierarchical training using the multi-agent API. - - -Special Action- and Observation Spaces --------------------------------------- - -- `Nested action spaces `__: - Learning in arbitrarily nested action spaces. -- `Parametric actions `__: - Example of how to handle variable-length or parametric action spaces. -- `Using the "Repeated" space of RLlib for variable lengths observations `__: - How to use RLlib's `Repeated` space to handle variable length observations. -- `Autoregressive action distribution example `__: - Learning with auto-regressive action dependencies (e.g. 2 action components; distribution for 2nd component depends on the 1st component's actually sampled value). - - -Community Examples ------------------- -- `Arena AI `__: - A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence - with RLlib-generated baselines. -- `CARLA `__: - Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. 
-- `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: - Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the - "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). -- `Flatland `__: - A dense traffic simulating environment with RLlib-generated baselines. -- `GFootball `__: - Example of setting up a multi-agent version of `GFootball `__ with RLlib. -- `mobile-env `__: - An open, minimalist Gymnasium environment for autonomous coordination in wireless mobile networks. - Includes an example notebook using Ray RLlib for multi-agent RL with mobile-env. -- `Neural MMO `__: - A multiagent AI research environment inspired by Massively Multiplayer Online (MMO) role playing games – - self-contained worlds featuring thousands of agents per persistent macrocosm, diverse skilling systems, local and global economies, complex emergent social structures, - and ad-hoc high-stakes single and team based conflict. -- `NeuroCuts `__: - Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. -- `NeuroVectorizer `__: - Example of learning optimal LLVM vectorization compiler pragmas for loops in C and C++ codes using RLlib. -- `Roboschool / SageMaker `__: - Example of training robotic control policies in SageMaker with RLlib. -- `Sequential Social Dilemma Games `__: - Example of using the multi-agent API to model several `social dilemma games `__. -- `Simple custom environment for single RL with Ray and RLlib `__: - Create a custom environment and train a single agent RL using Ray 2.0 with Tune. -- `StarCraft2 `__: - Example of training in StarCraft2 maps with RLlib / multi-agent. -- `Traffic Flow `__: - Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. - - -Blog Posts ----------- - -- `Attention Nets and More with RLlib’s Trajectory View API `__: - Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. -- `Reinforcement Learning with RLlib in the Unity Game Engine `__: - How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. -- `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: - Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. -- `Scaling Multi-Agent Reinforcement Learning `__: - Blog post of a brief tutorial on multi-agent RL and its design in RLlib. -- `Functional RL with Keras and TensorFlow Eager `__: - Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. +.. include:: ../../../rllib/examples/README.rst diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst new file mode 100644 index 000000000000..79d191280407 --- /dev/null +++ b/rllib/examples/README.rst @@ -0,0 +1,299 @@ +Examples +======== + +This page contains an index of all the python scripts in the `examples/ folder ` +of RLlib demonstrating the different use cases and features of the library. + +.. note:: + + RLlib is currently in a transition state from "old API stack" to "new API stack". + Some of the examples here haven't been translated yet to the new stack and are tagged + with the following comment line on top: `# @OldAPIStack`. 
We are in the + process of moving all example scripts over to the "new API stack" and expect to complete + this effort in the near future. + +.. note:: + + If any new-API-stack example is broken, or if you'd like to add an example to this page, + feel free to raise an issue on RLlib's `Github repository `. + + +Folder Structure +++++++++++++++++ +The `examples/ folder ` is +structured into several sub-directories, the contents of all of which are described in detail below. + + +How to run an example script +++++++++++++++++++++++++++++ + +Most of the example scripts are self-executable, meaning you can just cd into the respective +directory and type: + +.. code-block:: bash + + $ cd examples/multi_agent + $ python multi_agent_pendulum.py --enable-new-api-stack --num-agents=2 + + +Use the `--help` command line arg to have each script print out its specific command line options. +However, most of the scripts share a common subset of generally applicable command line args, for example +`--num-env-runners`, `--no-tune`, or `--wandb-key`. + + +All sub-folders ++++++++++++++++ + + +Algorithms +---------- + +.. include:: algorithms/README.rst + +Catalogs +-------- + +.. include:: catalogs/README.rst + +Checkpoints +----------- + +.. include:: catalogs/README.rst + +Connectors +---------- + +.. include:: connectors/README.rst + + +Curriculum Learning +------------------- + + +Debugging +--------- + + +Environments +------------ + +Evaluation +---------- + +GPU (for Training and Sampling) +------------------------------- + + +Hierarchical Training +--------------------- + + +Inference (of Models/Policies) +------------------------------ + + +Learners +-------- + + +Multi-Agent RL +-------------- + +Offline RL +---------- + +Ray Serve and RLlib +------------------- + +Ray Tune and RLlib +------------------ + +RLModules +--------- + + + + + + + +TODO: Link to tuned_examples, which should have their own README.rst + +Tuned Examples +-------------- + +- `Tuned examples `__: + Collection of tuned hyperparameters sorted by algorithm. + + + + +************************************************************** + + +Environments and Adapters +------------------------- + +- `Registering a custom env and model `__: + Example of defining and registering a gym env and model for use with RLlib. +- `Local Unity3D multi-agent environment example `__: + Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to + learn any Unity3D game (including support for multi-agent). + Use this example to try things out and watch the game and the learning progress live in the editor. + Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. + For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. + +Custom- and Complex Models +-------------------------- + +- `Custom Keras model `__: + Example of using a custom Keras model. +- `Registering a custom model with supervised loss `__: + Example of defining and registering a custom model with a supervised loss. +- `Batch normalization `__: + Example of adding batch norm layers to a custom model. +- `Custom model API example `__: + Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. +- `Trajectory View API utilizing model `__: + An example on how a model can use the trajectory view API to specify its own input. 
+- `MobileNetV2 wrapping example model `__: + Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. +- `Differentiable Neural Computer `__: + Example of DeepMind's Differentiable Neural Computer for partially observable environments. + + +Training Workflows +------------------ + +- `Custom training workflows `__: + Example of how to use Tune's support for custom training functions to implement custom training workflows. +- `Curriculum learning with the TaskSettableEnv API `__: + Example of how to advance the environment through different phases (tasks) over time. + Also see the `curriculum learning how-to `__ from the documentation here. +- `Custom logger `__: + How to setup a custom Logger object in RLlib. +- `Custom metrics `__: + Example of how to output custom training metrics to TensorBoard. +- `Custom tune experiment `__: + How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. + + +Evaluation: +----------- +- `Custom evaluation function `__: + Example of how to write a custom evaluation function that's called instead of the default behavior, which is running with the evaluation worker set through n episodes. +- `Parallel evaluation and training `__: + Example showing how the evaluation workers and the "normal" rollout workers can run (to some extend) in parallel to speed up training. + + +Serving and Offline +------------------- +- `Offline RL with CQL `__: + Example showing how to run an offline RL training job using a historic-data JSON file. +- `Another example for using RLlib with Ray Serve `__ + This script offers a simple workflow for 1) training a policy with RLlib first, 2) creating a new policy 3) restoring its weights from the trained + one and serving the new policy with Ray Serve. +- `Unity3D client/server `__: + Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting + clients against a central RLlib Policy server learning how to play the game. + The n distributed clients could themselves be servers for external/human players and allow for control + being fully in the hands of the Unity entities instead of RLlib. + Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. +- `CartPole client/server `__: + Example of online serving of predictions for a simple CartPole policy. +- `Saving experiences `__: + Example of how to externally generate experience batches in RLlib-compatible format. +- `Finding a checkpoint using custom criteria `__: + Example of how to find a :ref:`checkpoint ` after a `Tuner.fit()` with some custom defined criteria. + + +Multi-Agent and Hierarchical +---------------------------- + +- `Simple independent multi-agent setup vs a PettingZoo env `__: + Setup RLlib to run any algorithm in (independent) multi-agent mode against a multi-agent environment. +- `More complex (shared-parameter) multi-agent setup vs a PettingZoo env `__: + Setup RLlib to run any algorithm in (shared-parameter) multi-agent mode against a multi-agent environment. +- `Rock-paper-scissors heuristic vs learned `__ and `Rock-paper-scissors learned vs learned `__: + Two examples of different heuristic and learned policies competing against each other in the rock-paper-scissors environment. +- `Two-step game `__: + Example on how to use agent grouping in a multi-agent environment (the two-step game from the `QMIX paper `__). 
+- `PettingZoo multi-agent example `__: + Example on how to use RLlib to learn in `PettingZoo `__ multi-agent environments. +- `PPO with centralized critic on two-step game `__: + Example of customizing PPO to leverage a centralized value function. +- `Centralized critic in the env `__: + A simpler method of implementing a centralized critic by augmenting agent observations with global information. +- `Hand-coded policy `__: + Example of running a custom hand-coded policy alongside trainable policies. +- `Weight sharing between policies `__: + Example of how to define weight-sharing layers between two different policies. +- `Multiple algorithms `__: + Example of alternating training between DQN and PPO. +- `Hierarchical training `__: + Example of hierarchical training using the multi-agent API. + + +Special Action- and Observation Spaces +-------------------------------------- + +- `Nested action spaces `__: + Learning in arbitrarily nested action spaces. +- `Parametric actions `__: + Example of how to handle variable-length or parametric action spaces. +- `Using the "Repeated" space of RLlib for variable lengths observations `__: + How to use RLlib's `Repeated` space to handle variable length observations. +- `Autoregressive action distribution example `__: + Learning with auto-regressive action dependencies (for example two action components, where distribution for the second component depends on the first's actually sampled value). + + +Community Examples +------------------ +- `Arena AI `__: + A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence + with RLlib-generated baselines. +- `CARLA `__: + Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. +- `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: + Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the + "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). +- `Flatland `__: + A dense traffic simulating environment with RLlib-generated baselines. +- `GFootball `__: + Example of setting up a multi-agent version of `GFootball `__ with RLlib. +- `mobile-env `__: + An open, minimalist Gymnasium environment for autonomous coordination in wireless mobile networks. + Includes an example notebook using Ray RLlib for multi-agent RL with mobile-env. +- `Neural MMO `__: + A multiagent AI research environment inspired by Massively Multiplayer Online (MMO) role playing games – + self-contained worlds featuring thousands of agents per persistent macrocosm, diverse skilling systems, local and global economies, complex emergent social structures, + and ad-hoc high-stakes single and team based conflict. +- `NeuroCuts `__: + Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. +- `NeuroVectorizer `__: + Example of learning optimal LLVM vectorization compiler pragmas for loops in C and C++ codes using RLlib. +- `Roboschool / SageMaker `__: + Example of training robotic control policies in SageMaker with RLlib. +- `Sequential Social Dilemma Games `__: + Example of using the multi-agent API to model several `social dilemma games `__. +- `Simple custom environment for single RL with Ray and RLlib `__: + Create a custom environment and train a single agent RL using Ray 2.0 with Tune. +- `StarCraft2 `__: + Example of training in StarCraft2 maps with RLlib / multi-agent. 
+- `Traffic Flow `__: + Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. + + +Blog Posts +---------- + +- `Attention Nets and More with RLlib’s Trajectory View API `__: + Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. +- `Reinforcement Learning with RLlib in the Unity Game Engine `__: + How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. +- `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: + Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. +- `Scaling Multi-Agent Reinforcement Learning `__: + Blog post of a brief tutorial on multi-agent RL and its design in RLlib. +- `Functional RL with Keras and TensorFlow Eager `__: + Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. diff --git a/rllib/examples/_old_api_stack/attention_net_supervised.py b/rllib/examples/_old_api_stack/attention_net_supervised.py index d5615f8f042f..2c0f13f506aa 100644 --- a/rllib/examples/_old_api_stack/attention_net_supervised.py +++ b/rllib/examples/_old_api_stack/attention_net_supervised.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Box, Discrete import numpy as np diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py index 901444373127..4603ed882250 100644 --- a/rllib/examples/_old_api_stack/complex_struct_space.py +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -1,3 +1,4 @@ +# @OldAPIStack """Example of using variable-length Repeated / struct observation spaces. This example demonstrates the following: diff --git a/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py index b4dcb535b230..ce2e99211ae4 100644 --- a/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack """This example script shows how to load a connector enabled policy, and adapt/use it with a different version of the environment. """ diff --git a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py index 35d151341fcb..42c789f5ea17 100644 --- a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py @@ -1,3 +1,4 @@ +# @OldAPIStack import random from ray.rllib.algorithms.appo import APPOConfig diff --git a/rllib/examples/_old_api_stack/connectors/run_connector_policy.py b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py index 8a84763f99af..7daf136cdc66 100644 --- a/rllib/examples/_old_api_stack/connectors/run_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack """This example script shows how to load a connector enabled policy, and use it in a serving/inference setting. 
""" diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index 8781b4b48383..460b7ad79058 100644 --- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -1,3 +1,4 @@ +# @OldAPIStack """Example showing how one can restore a connector enabled TF policy checkpoint for a new self-play PyTorch training job. The checkpointed policy may be trained with a different algorithm too. diff --git a/rllib/examples/_old_api_stack/custom_keras_model.py b/rllib/examples/_old_api_stack/custom_keras_model.py index c49010ef16aa..cdf1f516ef32 100644 --- a/rllib/examples/_old_api_stack/custom_keras_model.py +++ b/rllib/examples/_old_api_stack/custom_keras_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack """Example of using a custom ModelV2 Keras-style model.""" import argparse diff --git a/rllib/examples/_old_api_stack/models/action_mask_model.py b/rllib/examples/_old_api_stack/models/action_mask_model.py index 02630806794e..92fe99e53847 100644 --- a/rllib/examples/_old_api_stack/models/action_mask_model.py +++ b/rllib/examples/_old_api_stack/models/action_mask_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Dict from ray.rllib.models.tf.fcnet import FullyConnectedNetwork diff --git a/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py b/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py index 6afc6cf28274..fd8f2d53f778 100644 --- a/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py +++ b/rllib/examples/_old_api_stack/models/autoregressive_action_dist.py @@ -1,3 +1,4 @@ +# @OldAPIStack from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution from ray.rllib.models.torch.torch_action_dist import ( TorchCategorical, diff --git a/rllib/examples/_old_api_stack/models/autoregressive_action_model.py b/rllib/examples/_old_api_stack/models/autoregressive_action_model.py index 758435d9cda2..8b71e5ab9dc2 100644 --- a/rllib/examples/_old_api_stack/models/autoregressive_action_model.py +++ b/rllib/examples/_old_api_stack/models/autoregressive_action_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Discrete, Tuple from ray.rllib.models.tf.misc import normc_initializer diff --git a/rllib/examples/_old_api_stack/models/batch_norm_model.py b/rllib/examples/_old_api_stack/models/batch_norm_model.py index accb8dc3a8b7..7a5ac956d24e 100644 --- a/rllib/examples/_old_api_stack/models/batch_norm_model.py +++ b/rllib/examples/_old_api_stack/models/batch_norm_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/centralized_critic_models.py b/rllib/examples/_old_api_stack/models/centralized_critic_models.py index d15c6e6a1834..5ccc4448e542 100644 --- a/rllib/examples/_old_api_stack/models/centralized_critic_models.py +++ b/rllib/examples/_old_api_stack/models/centralized_critic_models.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Box from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/custom_model_api.py b/rllib/examples/_old_api_stack/models/custom_model_api.py index 27ef65f68f13..7297faa89038 100644 --- a/rllib/examples/_old_api_stack/models/custom_model_api.py +++ b/rllib/examples/_old_api_stack/models/custom_model_api.py @@ -1,3 +1,4 @@ +# 
@OldAPIStack from gymnasium.spaces import Box from ray.rllib.models.tf.fcnet import FullyConnectedNetwork diff --git a/rllib/examples/_old_api_stack/models/eager_model.py b/rllib/examples/_old_api_stack/models/eager_model.py index 5d030d32d695..1628fcda4abe 100644 --- a/rllib/examples/_old_api_stack/models/eager_model.py +++ b/rllib/examples/_old_api_stack/models/eager_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack import random from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/fast_model.py b/rllib/examples/_old_api_stack/models/fast_model.py index 1a1e24c3f30b..99ac7e83a7de 100644 --- a/rllib/examples/_old_api_stack/models/fast_model.py +++ b/rllib/examples/_old_api_stack/models/fast_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.torch.misc import SlimFC diff --git a/rllib/examples/_old_api_stack/models/mobilenet_v2_encoder.py b/rllib/examples/_old_api_stack/models/mobilenet_v2_encoder.py index 6a3482f547b0..34baf73f4ef5 100644 --- a/rllib/examples/_old_api_stack/models/mobilenet_v2_encoder.py +++ b/rllib/examples/_old_api_stack/models/mobilenet_v2_encoder.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ This file implements a MobileNet v2 Encoder. It uses MobileNet v2 to encode images into a latent space of 1000 dimensions. diff --git a/rllib/examples/_old_api_stack/models/mobilenet_v2_with_lstm_models.py b/rllib/examples/_old_api_stack/models/mobilenet_v2_with_lstm_models.py index c8c9b1f0bff0..fc0b310c4ed0 100644 --- a/rllib/examples/_old_api_stack/models/mobilenet_v2_with_lstm_models.py +++ b/rllib/examples/_old_api_stack/models/mobilenet_v2_with_lstm_models.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/modelv3.py b/rllib/examples/_old_api_stack/models/modelv3.py index 805c87e8a98d..a93879510455 100644 --- a/rllib/examples/_old_api_stack/models/modelv3.py +++ b/rllib/examples/_old_api_stack/models/modelv3.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np from ray.rllib.policy.sample_batch import SampleBatch diff --git a/rllib/examples/_old_api_stack/models/neural_computer.py b/rllib/examples/_old_api_stack/models/neural_computer.py index 90c8b0cb3493..d863f71e62d7 100644 --- a/rllib/examples/_old_api_stack/models/neural_computer.py +++ b/rllib/examples/_old_api_stack/models/neural_computer.py @@ -1,3 +1,4 @@ +# @OldAPIStack from collections import OrderedDict import gymnasium as gym from typing import Union, Dict, List, Tuple diff --git a/rllib/examples/_old_api_stack/models/parametric_actions_model.py b/rllib/examples/_old_api_stack/models/parametric_actions_model.py index 20711553b82b..e568b8bed72a 100644 --- a/rllib/examples/_old_api_stack/models/parametric_actions_model.py +++ b/rllib/examples/_old_api_stack/models/parametric_actions_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Box from ray.rllib.algorithms.dqn.distributional_q_tf_model import DistributionalQTFModel diff --git a/rllib/examples/_old_api_stack/models/rnn_model.py b/rllib/examples/_old_api_stack/models/rnn_model.py index c95b58d5e3a4..bdbc8b6a9c85 100644 --- a/rllib/examples/_old_api_stack/models/rnn_model.py +++ b/rllib/examples/_old_api_stack/models/rnn_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/rnn_spy_model.py 
b/rllib/examples/_old_api_stack/models/rnn_spy_model.py index fdf280f043f8..337990a60759 100644 --- a/rllib/examples/_old_api_stack/models/rnn_spy_model.py +++ b/rllib/examples/_old_api_stack/models/rnn_spy_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np import pickle diff --git a/rllib/examples/_old_api_stack/models/shared_weights_model.py b/rllib/examples/_old_api_stack/models/shared_weights_model.py index c7f44e00e060..28ad0896b18f 100644 --- a/rllib/examples/_old_api_stack/models/shared_weights_model.py +++ b/rllib/examples/_old_api_stack/models/shared_weights_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np from ray.rllib.models.modelv2 import ModelV2 diff --git a/rllib/examples/_old_api_stack/models/simple_rpg_model.py b/rllib/examples/_old_api_stack/models/simple_rpg_model.py index c96d24b29ee3..b37d915df8a1 100644 --- a/rllib/examples/_old_api_stack/models/simple_rpg_model.py +++ b/rllib/examples/_old_api_stack/models/simple_rpg_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as TFFCNet from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 diff --git a/rllib/examples/_old_api_stack/models/trajectory_view_utilizing_models.py b/rllib/examples/_old_api_stack/models/trajectory_view_utilizing_models.py index 1599d5c7ecfe..ed7e2919ede3 100644 --- a/rllib/examples/_old_api_stack/models/trajectory_view_utilizing_models.py +++ b/rllib/examples/_old_api_stack/models/trajectory_view_utilizing_models.py @@ -1,3 +1,4 @@ +# @OldAPIStack from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.torch.misc import SlimFC from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py index 697bd538300b..c049a2c9998b 100644 --- a/rllib/examples/_old_api_stack/parametric_actions_cartpole.py +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py @@ -1,3 +1,4 @@ +# @OldAPIStack """Example of handling variable length and/or parametric action spaces. This toy example demonstrates the action-embedding based approach for handling large diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py index 53e3a7e75f7c..2c48e7876d83 100644 --- a/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -1,3 +1,4 @@ +# @OldAPIStack """Example of handling variable length and/or parametric action spaces. 
This is a toy example of the action-embedding based approach for handling large diff --git a/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py b/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py index 40e7b575fbf6..c9a4758f81ea 100644 --- a/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py +++ b/rllib/examples/_old_api_stack/policy/cliff_walking_wall_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack import gymnasium as gym from typing import Dict, Union, List, Tuple, Optional import numpy as np diff --git a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py index 47ce9b92c884..0c00e3ebad86 100644 --- a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py +++ b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack import numpy as np import tree from gymnasium.spaces import Box diff --git a/rllib/examples/_old_api_stack/policy/memory_leaking_policy.py b/rllib/examples/_old_api_stack/policy/memory_leaking_policy.py index 9b813981ea48..3a5fa13ed509 100644 --- a/rllib/examples/_old_api_stack/policy/memory_leaking_policy.py +++ b/rllib/examples/_old_api_stack/policy/memory_leaking_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack import gymnasium as gym from ray.rllib.examples._old_api_stack.policy.random_policy import RandomPolicy diff --git a/rllib/examples/_old_api_stack/policy/random_policy.py b/rllib/examples/_old_api_stack/policy/random_policy.py index 2bc0a61f5913..c410ba0ec464 100644 --- a/rllib/examples/_old_api_stack/policy/random_policy.py +++ b/rllib/examples/_old_api_stack/policy/random_policy.py @@ -1,3 +1,4 @@ +# @OldAPIStack from gymnasium.spaces import Box import numpy as np import random diff --git a/rllib/examples/_old_api_stack/policy/rock_paper_scissors_dummies.py b/rllib/examples/_old_api_stack/policy/rock_paper_scissors_dummies.py index 06a22b53aaad..dbaa0e401038 100644 --- a/rllib/examples/_old_api_stack/policy/rock_paper_scissors_dummies.py +++ b/rllib/examples/_old_api_stack/policy/rock_paper_scissors_dummies.py @@ -1,3 +1,4 @@ +# @OldAPIStack import gymnasium as gym import numpy as np import random diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py index f9b2c43ba571..139cbbf6e045 100644 --- a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ This script demonstrates how to specify custom env APIs in combination with RLlib's `remote_worker_envs` setting, which diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index be4137c6aa95..c447bb4970af 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ This script demonstrates how to specify n (vectorized) envs as Ray remote (actors), such that stepping through these occurs in parallel. 
diff --git a/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py index 7c876ca5300b..fd4df54e06b3 100644 --- a/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py +++ b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example script on how to train, save, load, and test an RLlib agent. Equivalent script with stable baselines: sb2rllib_sb_example.py. diff --git a/rllib/examples/_old_api_stack/sb2rllib_sb_example.py b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py index 3812fea5420a..4f1be19c6a3b 100644 --- a/rllib/examples/_old_api_stack/sb2rllib_sb_example.py +++ b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example script on how to train, save, load, and test a stable baselines 2 agent. Code taken and adjusted from SB2 docs: diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index c9bab618fdf1..b89c7d6b23d1 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -1,3 +1,5 @@ +# @OldAPIStack + """Example showing how to use "action masking" in RLlib. "Action masking" allows the agent to select actions based on the current diff --git a/rllib/examples/algorithms/README.rst b/rllib/examples/algorithms/README.rst new file mode 100644 index 000000000000..a13c97302574 --- /dev/null +++ b/rllib/examples/algorithms/README.rst @@ -0,0 +1,5 @@ + +- `Custom Algorith.training_step() method combining on- and off-policy learning `__: + Example of how to override the :py:meth:`~ray.rllib.algorithms.algorithm.training_step` method of the + :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel + (also using multi-agent API). diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 0cb6a0df654c..7e66d499c044 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -1,3 +1,5 @@ +# @OldAPIStack + """Example of using a custom training workflow. This example creates a number of CartPole agents, some of which are trained with diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index af5963cff5d3..1a9ef8af8cb6 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """ Example of specifying an autoregressive action distribution. diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 5b4110bf1ff3..a154a73f088a 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -1,4 +1,5 @@ -# TODO (sven): Move this example script into the new API stack. 
+# @OldAPIStack + # TODO (sven): Move this script to `examples/rl_modules/...` import argparse diff --git a/rllib/examples/catalogs/README.rst b/rllib/examples/catalogs/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/catalogs/custom_action_distribution.py b/rllib/examples/catalogs/custom_action_distribution.py index a404dd36e7bb..6eb8aa234ea4 100644 --- a/rllib/examples/catalogs/custom_action_distribution.py +++ b/rllib/examples/catalogs/custom_action_distribution.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack (w/ EnvRunners). +# @HybridAPIStack """ This example shows two modifications: diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index beebdb79f773..56c43017d65c 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """ This example shows two modifications: diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index c01f27f23979..1505a01c0f21 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack # *********************************************************************************** # IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by diff --git a/rllib/examples/centralized_critic_2.py b/rllib/examples/centralized_critic_2.py index 36955cd46275..6a2392f96385 100644 --- a/rllib/examples/centralized_critic_2.py +++ b/rllib/examples/centralized_critic_2.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack # *********************************************************************************** # IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by diff --git a/rllib/examples/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoint_by_custom_criteria.py deleted file mode 100644 index b5fc9b057613..000000000000 --- a/rllib/examples/checkpoint_by_custom_criteria.py +++ /dev/null @@ -1,6 +0,0 @@ -msg = """ -This script has been moved to -`ray.rllib.examples.checkpoints.checkpoint_by_custom_criteria.py` -""" - -raise NotImplementedError(msg) diff --git a/rllib/examples/checkpoints/README.rst b/rllib/examples/checkpoints/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/checkpoints/cartpole_dqn_export.py b/rllib/examples/checkpoints/cartpole_dqn_export.py index 749cfe4305ea..48e73f15b6ae 100644 --- a/rllib/examples/checkpoints/cartpole_dqn_export.py +++ b/rllib/examples/checkpoints/cartpole_dqn_export.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# TODO (sven): Move this example script into the new API stack. 
+# @OldAPIStack import numpy as np import os diff --git a/rllib/examples/checkpoints/onnx_tf.py b/rllib/examples/checkpoints/onnx_tf.py index f63847f117f8..b4a1264c2046 100644 --- a/rllib/examples/checkpoints/onnx_tf.py +++ b/rllib/examples/checkpoints/onnx_tf.py @@ -1,3 +1,4 @@ +# @OldAPIStack import argparse import numpy as np import onnxruntime diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index 77a1ffb5f28a..c377a5c65663 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack from packaging.version import Version import numpy as np diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index 9c4dc3805613..ede6a4d88d01 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Simple example of how to restore only one of n agents from a trained multi-agent Algorithm using Ray tune. diff --git a/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py b/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py index bcbcf515e15a..b7cc351be042 100644 --- a/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py +++ b/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py @@ -1,3 +1,5 @@ +# @OldAPIStack + """ Adapted (time-dependent) GAE for PPO algorithm can be activated by setting use_adapted_gae=True in the policy config. Additionally, it is required that diff --git a/rllib/examples/connectors/README.rst b/rllib/examples/connectors/README.rst new file mode 100644 index 000000000000..06010d452e41 --- /dev/null +++ b/rllib/examples/connectors/README.rst @@ -0,0 +1 @@ +TODO: add note about connectors vs connectorV2 \ No newline at end of file diff --git a/rllib/examples/curriculum/README.rst b/rllib/examples/curriculum/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/custom_metrics_and_callbacks.py index 83b705403f37..03a37d61323a 100644 --- a/rllib/examples/custom_metrics_and_callbacks.py +++ b/rllib/examples/custom_metrics_and_callbacks.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example of using RLlib's debug callbacks. diff --git a/rllib/examples/custom_model_api.py b/rllib/examples/custom_model_api.py index bbc484e0d396..e1e6705bbf77 100644 --- a/rllib/examples/custom_model_api.py +++ b/rllib/examples/custom_model_api.py @@ -1,3 +1,4 @@ +# @OldAPIStack import argparse from gymnasium.spaces import Box, Discrete import numpy as np diff --git a/rllib/examples/custom_model_loss_and_metrics.py b/rllib/examples/custom_model_loss_and_metrics.py index 324b3f6f4ee7..ccb3d8e1acd0 100644 --- a/rllib/examples/custom_model_loss_and_metrics.py +++ b/rllib/examples/custom_model_loss_and_metrics.py @@ -1,4 +1,5 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack + # Users should just inherit the Learner and extend the loss_fn. 
# TODO (sven): Move this example script to `examples/learners/...` diff --git a/rllib/examples/custom_recurrent_rnn_tokenizer.py b/rllib/examples/custom_recurrent_rnn_tokenizer.py index 4f99f6fdd8ec..59be6b31ea7e 100644 --- a/rllib/examples/custom_recurrent_rnn_tokenizer.py +++ b/rllib/examples/custom_recurrent_rnn_tokenizer.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example of define custom tokenizers for recurrent models in RLModules. diff --git a/rllib/examples/debugging/README.rst b/rllib/examples/debugging/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index ce42bc378235..af3c383f3727 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """ Example of a fully deterministic, repeatable RLlib train run using diff --git a/rllib/examples/envs/README.rst b/rllib/examples/envs/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 162aa5f39d0d..5af971ad23fb 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example of interfacing with an environment that produces 2D observations. diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 91f7ea6ecb55..40350a8c5853 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """ Example of running an RLlib Algorithm against a locally running Unity3D editor diff --git a/rllib/examples/evaluation/README.rst b/rllib/examples/evaluation/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/gpus/README.rst b/rllib/examples/gpus/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/hierarchical/README.rst b/rllib/examples/hierarchical/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/hierarchical/hierarchical_training.py b/rllib/examples/hierarchical/hierarchical_training.py index b7b02d76fffc..924aa5de2f07 100644 --- a/rllib/examples/hierarchical/hierarchical_training.py +++ b/rllib/examples/hierarchical/hierarchical_training.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example of hierarchical training using the multi-agent API. diff --git a/rllib/examples/inference/README.rst b/rllib/examples/inference/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 91b85ecf48ce..90c5eeba3dab 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example showing how you can use your trained policy for inference (computing actions) in an environment. 
diff --git a/rllib/examples/inference/policy_inference_after_training_with_attention.py b/rllib/examples/inference/policy_inference_after_training_with_attention.py index 97f5f2ecded8..a3166c3d4761 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_attention.py +++ b/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example showing how you can use your trained policy for inference (computing actions) in an environment. diff --git a/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/rllib/examples/inference/policy_inference_after_training_with_lstm.py index 7bee98e132f3..a7dc5ada6f3c 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_lstm.py +++ b/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -1,3 +1,4 @@ +# @OldAPIStack """ Example showing how you can use your trained policy for inference (computing actions) in an environment. diff --git a/rllib/examples/learners/README.rst b/rllib/examples/learners/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/multi_agent/README.rst b/rllib/examples/multi_agent/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index 589eaa8f80e9..f77c6d0d5c3b 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example of using two different training methods at once in multi-agent. diff --git a/rllib/examples/offline_rl/README.rst b/rllib/examples/offline_rl/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/offline_rl/custom_input_api.py b/rllib/examples/offline_rl/custom_input_api.py index 660e1982a015..bd192184155f 100644 --- a/rllib/examples/offline_rl/custom_input_api.py +++ b/rllib/examples/offline_rl/custom_input_api.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example of creating a custom input api diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index f3148cbbeef0..cd893013321a 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. +# @OldAPIStack """Example on how to use CQL to learn from an offline json file. diff --git a/rllib/examples/offline_rl/saving_experiences.py b/rllib/examples/offline_rl/saving_experiences.py index 168297cc297d..27c76c264da9 100644 --- a/rllib/examples/offline_rl/saving_experiences.py +++ b/rllib/examples/offline_rl/saving_experiences.py @@ -1,4 +1,4 @@ -# TODO (sven): Move this example script into the new API stack. 
+# @OldAPIStack """Simple example of writing experiences to a file using JsonWriter.""" diff --git a/rllib/examples/ray_serve/README.rst b/rllib/examples/ray_serve/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/ray_tune/README.rst b/rllib/examples/ray_tune/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/replay_buffer_api.py b/rllib/examples/replay_buffer_api.py index c0ce3939bd92..5d87a5ef5cd3 100644 --- a/rllib/examples/replay_buffer_api.py +++ b/rllib/examples/replay_buffer_api.py @@ -1,4 +1,4 @@ -# Move this example to the new API stack. +# @OldAPIStack # __sphinx_doc_replay_buffer_api_example_script_begin__ """Simple example of how to modify replay buffer behaviour. diff --git a/rllib/examples/rl_modules/README.rst b/rllib/examples/rl_modules/README.rst new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/rl_modules/action_masking_rlm.py b/rllib/examples/rl_modules/action_masking_rlm.py deleted file mode 100644 index 68bde8c8a8f2..000000000000 --- a/rllib/examples/rl_modules/action_masking_rlm.py +++ /dev/null @@ -1,6 +0,0 @@ -msg = """ -This script has been moved to -`ray.rllib.examples.rl_modules.classes.action_masking_rlm.py` -""" - -raise NotImplementedError(msg) diff --git a/rllib/examples/rl_modules/episode_env_aware_rlm.py b/rllib/examples/rl_modules/episode_env_aware_rlm.py deleted file mode 100644 index 9cafd034ec0b..000000000000 --- a/rllib/examples/rl_modules/episode_env_aware_rlm.py +++ /dev/null @@ -1,6 +0,0 @@ -msg = """ -This script has been moved to -`ray.rllib.examples.rl_modules.classes.random_rlm.py::StatefulRandomRLModule` -""" - -raise NotImplementedError(msg) diff --git a/rllib/examples/rl_modules/frame_stacking_rlm.py b/rllib/examples/rl_modules/frame_stacking_rlm.py deleted file mode 100644 index 4ed592fa8705..000000000000 --- a/rllib/examples/rl_modules/frame_stacking_rlm.py +++ /dev/null @@ -1,12 +0,0 @@ -msg = """ -This script has been taken out of RLlib b/c: -- This script used `ViewRequirements` ("Trajectory View API") to set up the RLModule, -however, this API will not be part of the new API stack. -Instead, you can use RLlib's built-in ConnectorV2 for frame stacking (or write a custom -ConnectorV2). Take a look at this example script here, which shows how you can do frame- -stacking with RLlib's new ConnectorV2 API. 
- -`ray.rllib.examples.connectors.frame_stacking.py` -""" - -raise NotImplementedError(msg) diff --git a/rllib/examples/rl_modules/mobilenet_rlm.py b/rllib/examples/rl_modules/mobilenet_rlm.py deleted file mode 100644 index 84f57d0566e0..000000000000 --- a/rllib/examples/rl_modules/mobilenet_rlm.py +++ /dev/null @@ -1,6 +0,0 @@ -msg = """ -This script has been moved to -`ray.rllib.examples.rl_modules.classes.mobilenet_rlm.py` -""" - -raise NotImplementedError(msg) diff --git a/rllib/examples/rl_modules/random_rl_module.py b/rllib/examples/rl_modules/random_rl_module.py deleted file mode 100644 index eac2d59ddf61..000000000000 --- a/rllib/examples/rl_modules/random_rl_module.py +++ /dev/null @@ -1,6 +0,0 @@ -msg = """ -This script has been moved to -`ray.rllib.examples.rl_modules.classes.random_rlm.py` -""" - -raise NotImplementedError(msg) From d65082fc738aed2036048b0781cdc34ae54acfa6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 16 May 2024 15:05:46 +0200 Subject: [PATCH 10/23] wip Signed-off-by: sven1977 --- .../sigils/rllib-sigil-new-api-stack.svg | 1 + doc/source/rllib/rllib-advanced-api.rst | 102 ++++-------------- rllib/examples/README.rst | 32 ++++-- rllib/examples/curriculum/README.rst | 12 +++ .../curriculum/curriculum_learning.py | 5 +- 5 files changed, 61 insertions(+), 91 deletions(-) create mode 100644 doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg diff --git a/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg b/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg new file mode 100644 index 000000000000..15c9e494533c --- /dev/null +++ b/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/rllib/rllib-advanced-api.rst b/doc/source/rllib/rllib-advanced-api.rst index 8dd16f271217..0dfbe7c9139f 100644 --- a/doc/source/rllib/rllib-advanced-api.rst +++ b/doc/source/rllib/rllib-advanced-api.rst @@ -19,87 +19,31 @@ implement `custom training workflows (example) `__ environment API. -Your environment only needs to implement the `set_task` and `get_task` methods -for this to work. You can then define an `env_task_fn` in your config, -which receives the last training results and returns a new task for the env to be set to: - -.. TODO move to doc_code and make it use algo configs. -.. code-block:: python - - from ray.rllib.env.apis.task_settable_env import TaskSettableEnv - - class MyEnv(TaskSettableEnv): - def get_task(self): - return self.current_difficulty - - def set_task(self, task): - self.current_difficulty = task - - def curriculum_fn(train_results, task_settable_env, env_ctx): - # Very simple curriculum function. - current_task = task_settable_env.get_task() - new_task = current_task + 1 - return new_task - - # Setup your Algorithm's config like so: - config = { - "env": MyEnv, - "env_task_fn": curriculum_fn, - } - # Train using `Tuner.fit()` or `Algorithm.train()` and the above config stub. - # ... - -There are two more ways to use the RLlib's other APIs to implement -`curriculum learning `__. - -Use the Algorithm API and update the environment between calls to ``train()``. -This example shows the algorithm being run inside a Tune function. -This is basically the same as what the built-in `env_task_fn` API described above -already does under the hood, but allows you to do even more customizations to your -training loop. - -.. TODO move to doc_code and make it use algo configs. -.. 
code-block:: python - - import ray - from ray import train, tune - from ray.rllib.algorithms.ppo import PPO - - def train_fn(config): - algo = PPO(config=config, env=YourEnv) - while True: - result = algo.train() - train.report(result) - if result["episode_reward_mean"] > 200: - task = 2 - elif result["episode_reward_mean"] > 100: - task = 1 - else: - task = 0 - algo.workers.foreach_worker( - lambda ev: ev.foreach_env( - lambda env: env.set_task(task))) - - num_gpus = 0 - num_env_runners = 2 +In curriculum learning, the environment can be set by the user to different difficulties +throughout the training process. This allows the algorithm to learn how to solve +the actual (final) problem incrementally, by interacting with and exploring in more and +more difficult phases. +Normally, such a curriculum starts with setting the environment to an easy difficulty and +then - as training progresses - transitions more and more toward a harder-to-solve difficulty. +`See this blog post here `__ +for another example of how curriculum learning can be done. + +RLlib's Algorithm and custom callbacks APIs allow for implementing any arbitrary +curricula. We will quickly touch on the `example script found here <>`__ to introduce +the basic concepts needed, then refer you + +First, we define some env options. We will work with the `FrozenLake-v1` environment, +a grid world, whose map is fully customizable. Our three tasks (different env difficulties) +are represented by slightly different maps that our agent will have to navigate. + +.. literalinclude:: /../../rllib/examples/curriculum/curriculum_learning.py + :language: python + :start-after: __curriculum_learning_example_env_options__ + :end-before: __END_curriculum_learning_example_env_options__ - ray.init() - tune.Tuner( - tune.with_resources(train_fn, resources=tune.PlacementGroupFactory( - [{"CPU": 1}, {"GPU": num_gpus}] + [{"CPU": 1}] * num_env_runners - ),) - param_space={ - "num_gpus": num_gpus, - "num_env_runners": num_env_runners, - }, - ).fit() +Then, we define the central piece controlling the curriculum, which is a custom callbacks class +overriding the :py:meth:`~ray.rllib.algorithms.callbacks.Callbacks.on_train_result` -You could also use RLlib's callbacks API to update the environment on new training -results: .. TODO move to doc_code and make it use algo configs. .. code-block:: python diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst index 79d191280407..1fa59358035c 100644 --- a/rllib/examples/README.rst +++ b/rllib/examples/README.rst @@ -58,70 +58,85 @@ Catalogs Checkpoints ----------- -.. include:: catalogs/README.rst +.. include:: checkpoints/README.rst Connectors ---------- .. include:: connectors/README.rst - Curriculum Learning ------------------- +.. include:: curriculum/README.rst Debugging --------- +.. include:: debugging/README.rst Environments ------------ +.. include:: envs/README.rst + Evaluation ---------- +.. include:: evaluation/README.rst + GPU (for Training and Sampling) ------------------------------- +.. include:: gpus/README.rst Hierarchical Training --------------------- +.. include:: hierarchical/README.rst Inference (of Models/Policies) ------------------------------ +.. include:: inference/README.rst Learners -------- +.. include:: learners/README.rst Multi-Agent RL -------------- +.. include:: multi_agent/README.rst + Offline RL ---------- +.. include:: offline/README.rst + Ray Serve and RLlib ------------------- +.. include:: ray_serve/README.rst + Ray Tune and RLlib ------------------ +.. 
include:: ray_tune/README.rst + RLModules --------- +.. include:: rl_modules/README.rst +Tuned Examples +++++++++++++++ -TODO: Link to tuned_examples, which should have their own README.rst - -Tuned Examples --------------- - - `Tuned examples `__: Collection of tuned hyperparameters sorted by algorithm. @@ -167,9 +182,6 @@ Training Workflows - `Custom training workflows `__: Example of how to use Tune's support for custom training functions to implement custom training workflows. -- `Curriculum learning with the TaskSettableEnv API `__: - Example of how to advance the environment through different phases (tasks) over time. - Also see the `curriculum learning how-to `__ from the documentation here. - `Custom logger `__: How to setup a custom Logger object in RLlib. - `Custom metrics `__: diff --git a/rllib/examples/curriculum/README.rst b/rllib/examples/curriculum/README.rst index e69de29bb2d1..bf88a730e83c 100644 --- a/rllib/examples/curriculum/README.rst +++ b/rllib/examples/curriculum/README.rst @@ -0,0 +1,12 @@ + +.. |new_stack| image:: ../../../doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg + :class: inline-figure + :width: 40 + +- |new_stack| `Curriculum learning with the custom callbacks API `__: + Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) + and thus help the learning algorithm to cope with an otherwise unsolvable task. + Also see the :doc:`curriculum learning how-to ` from the documentation. + + + diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 02916ed459df..96539e7c2ec9 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -1,4 +1,4 @@ -"""Example of using an env-task curriculum via implementing a custom callback. +"""Example of using an env-task curriculum by implementing a custom callback. This example: - demonstrates how to define your own curriculum-capable environments using @@ -89,7 +89,7 @@ "hardest task right away).", ) - +# __curriculum_learning_example_env_options__ ENV_OPTIONS = { "is_slippery": False, # Limit the number of steps the agent is allowed to make in the env to @@ -133,6 +133,7 @@ "FHFFFFFG", ], ] +# __END_curriculum_learning_example_env_options__ # Simple function sent to an EnvRunner to change the map of all its gym.Envs from From 5d1fa2174899197a6b4106c860e270435cd14a1f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 4 Jun 2024 07:30:40 +0200 Subject: [PATCH 11/23] wip Signed-off-by: sven1977 --- rllib/examples/README.rst | 19 +++++++++---------- rllib/examples/gpus/README.rst | 6 ++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst index 1fa59358035c..1d9902495102 100644 --- a/rllib/examples/README.rst +++ b/rllib/examples/README.rst @@ -2,15 +2,15 @@ Examples ======== This page contains an index of all the python scripts in the `examples/ folder ` -of RLlib demonstrating the different use cases and features of the library. +of RLlib, demonstrating the different use cases and features of the library. .. note:: RLlib is currently in a transition state from "old API stack" to "new API stack". Some of the examples here haven't been translated yet to the new stack and are tagged - with the following comment line on top: `# @OldAPIStack`. 
We are in the - process of moving all example scripts over to the "new API stack" and expect to complete - this effort in the near future. + with the following comment line on top: ``# @OldAPIStack``. The moving of all example + scripts over to the "new API stack" is work in progress and expected to be completed + by the end of 2024. .. note:: @@ -27,7 +27,7 @@ structured into several sub-directories, the contents of all of which are descri How to run an example script ++++++++++++++++++++++++++++ -Most of the example scripts are self-executable, meaning you can just cd into the respective +Most of the example scripts are self-executable, meaning you can just ``cd`` into the respective directory and type: .. code-block:: bash @@ -36,9 +36,10 @@ directory and type: $ python multi_agent_pendulum.py --enable-new-api-stack --num-agents=2 -Use the `--help` command line arg to have each script print out its specific command line options. -However, most of the scripts share a common subset of generally applicable command line args, for example -`--num-env-runners`, `--no-tune`, or `--wandb-key`. +Use the `--help` command line argument to have each script print out its supported command line options. + +Most of the scripts share a common subset of generally applicable command line arguments, +for example `--num-env-runners`, `--no-tune`, or `--wandb-key`. All sub-folders @@ -131,12 +132,10 @@ RLModules .. include:: rl_modules/README.rst - Tuned Examples ++++++++++++++ - - `Tuned examples `__: Collection of tuned hyperparameters sorted by algorithm. diff --git a/rllib/examples/gpus/README.rst b/rllib/examples/gpus/README.rst index e69de29bb2d1..18b254f09b55 100644 --- a/rllib/examples/gpus/README.rst +++ b/rllib/examples/gpus/README.rst @@ -0,0 +1,6 @@ + +- `Using fractional GPUs for training your model `__: + If your model is small and easily fits on a single GPU and you want to therefore train + other models alongside it to save time and cost, this script shows you how to set up + your RLlib config with a fractional number of GPUs on the learner (model training) + side. From d0995ae94d76f37dedb06dd0841eb2685a2de59c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 10 Jun 2024 12:21:20 +0200 Subject: [PATCH 12/23] wip Signed-off-by: sven1977 --- rllib/examples/README.rst | 71 +++++++++++++++------------- rllib/examples/algorithms/README.rst | 5 +- rllib/examples/connectors/README.rst | 35 +++++++++++++- 3 files changed, 75 insertions(+), 36 deletions(-) diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst index 1d9902495102..4f7d020b30c7 100644 --- a/rllib/examples/README.rst +++ b/rllib/examples/README.rst @@ -1,3 +1,8 @@ +.. |newstack| image:: images/tensorflow.png + :class: inline-figure + :width: 16 + + Examples ======== @@ -148,9 +153,9 @@ Tuned Examples Environments and Adapters ------------------------- -- `Registering a custom env and model `__: +- |oldstack| `Registering a custom env and model `__: Example of defining and registering a gym env and model for use with RLlib. -- `Local Unity3D multi-agent environment example `__: +- |oldstack| `Local Unity3D multi-agent environment example `__: Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to learn any Unity3D game (including support for multi-agent). Use this example to try things out and watch the game and the learning progress live in the editor. 
@@ -160,19 +165,19 @@ Environments and Adapters Custom- and Complex Models -------------------------- -- `Custom Keras model `__: +- |oldstack| `Custom Keras model `__: Example of using a custom Keras model. -- `Registering a custom model with supervised loss `__: +- |oldstack| `Registering a custom model with supervised loss `__: Example of defining and registering a custom model with a supervised loss. -- `Batch normalization `__: +- |oldstack| `Batch normalization `__: Example of adding batch norm layers to a custom model. -- `Custom model API example `__: +- |oldstack| `Custom model API example `__: Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. -- `Trajectory View API utilizing model `__: +- |oldstack| `Trajectory View API utilizing model `__: An example on how a model can use the trajectory view API to specify its own input. -- `MobileNetV2 wrapping example model `__: +- |oldstack| `MobileNetV2 wrapping example model `__: Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. -- `Differentiable Neural Computer `__: +- |oldstack| `Differentiable Neural Computer `__: Example of DeepMind's Differentiable Neural Computer for partially observable environments. @@ -248,63 +253,61 @@ Multi-Agent and Hierarchical Special Action- and Observation Spaces -------------------------------------- -- `Nested action spaces `__: - Learning in arbitrarily nested action spaces. -- `Parametric actions `__: +- |newstack| `Autoregressive action distribution example `__: + Learning with an auto-regressive action distribution (for example, two action components, where distribution of the second component depends on the first's actually sampled value). +- |oldstack| `Parametric actions `__: Example of how to handle variable-length or parametric action spaces. -- `Using the "Repeated" space of RLlib for variable lengths observations `__: +- |oldstack| `Using the "Repeated" space of RLlib for variable lengths observations `__: How to use RLlib's `Repeated` space to handle variable length observations. -- `Autoregressive action distribution example `__: - Learning with auto-regressive action dependencies (for example two action components, where distribution for the second component depends on the first's actually sampled value). Community Examples ------------------ -- `Arena AI `__: +- |oldstack| `Arena AI `__: A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence with RLlib-generated baselines. -- `CARLA `__: +- |oldstack| `CARLA `__: Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. -- `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: +- |oldstack| `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). -- `Flatland `__: +- |oldstack| `Flatland `__: A dense traffic simulating environment with RLlib-generated baselines. -- `GFootball `__: +- |oldstack| `GFootball `__: Example of setting up a multi-agent version of `GFootball `__ with RLlib. -- `mobile-env `__: +- |oldstack| `mobile-env `__: An open, minimalist Gymnasium environment for autonomous coordination in wireless mobile networks. 
Includes an example notebook using Ray RLlib for multi-agent RL with mobile-env. -- `Neural MMO `__: +- |oldstack| `Neural MMO `__: A multiagent AI research environment inspired by Massively Multiplayer Online (MMO) role playing games – self-contained worlds featuring thousands of agents per persistent macrocosm, diverse skilling systems, local and global economies, complex emergent social structures, and ad-hoc high-stakes single and team based conflict. -- `NeuroCuts `__: +- |oldstack| `NeuroCuts `__: Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. -- `NeuroVectorizer `__: +- |oldstack| `NeuroVectorizer `__: Example of learning optimal LLVM vectorization compiler pragmas for loops in C and C++ codes using RLlib. -- `Roboschool / SageMaker `__: +- |oldstack| `Roboschool / SageMaker `__: Example of training robotic control policies in SageMaker with RLlib. -- `Sequential Social Dilemma Games `__: +- |oldstack| `Sequential Social Dilemma Games `__: Example of using the multi-agent API to model several `social dilemma games `__. -- `Simple custom environment for single RL with Ray and RLlib `__: +- |oldstack| `Simple custom environment for single RL with Ray and RLlib `__: Create a custom environment and train a single agent RL using Ray 2.0 with Tune. -- `StarCraft2 `__: +- |oldstack| `StarCraft2 `__: Example of training in StarCraft2 maps with RLlib / multi-agent. -- `Traffic Flow `__: +- |oldstack| `Traffic Flow `__: Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. Blog Posts ---------- -- `Attention Nets and More with RLlib’s Trajectory View API `__: +- |oldstack| `Attention Nets and More with RLlib’s Trajectory View API `__: Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. -- `Reinforcement Learning with RLlib in the Unity Game Engine `__: +- |oldstack| `Reinforcement Learning with RLlib in the Unity Game Engine `__: How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. -- `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: +- |oldstack| `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. -- `Scaling Multi-Agent Reinforcement Learning `__: +- |oldstack| `Scaling Multi-Agent Reinforcement Learning `__: Blog post of a brief tutorial on multi-agent RL and its design in RLlib. -- `Functional RL with Keras and TensorFlow Eager `__: +- |oldstack| `Functional RL with Keras and TensorFlow Eager `__: Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. diff --git a/rllib/examples/algorithms/README.rst b/rllib/examples/algorithms/README.rst index a13c97302574..fc6f009170de 100644 --- a/rllib/examples/algorithms/README.rst +++ b/rllib/examples/algorithms/README.rst @@ -1,5 +1,8 @@ +.. 
|newstack| image:: ../../doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg + :class: inline-figure + :width: 32 -- `Custom Algorith.training_step() method combining on- and off-policy learning `__: +- |newstack| `Custom Algorith.training_step() method combining on- and off-policy learning `__: Example of how to override the :py:meth:`~ray.rllib.algorithms.algorithm.training_step` method of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel (also using multi-agent API). diff --git a/rllib/examples/connectors/README.rst b/rllib/examples/connectors/README.rst index 06010d452e41..119100eeb9ba 100644 --- a/rllib/examples/connectors/README.rst +++ b/rllib/examples/connectors/README.rst @@ -1 +1,34 @@ -TODO: add note about connectors vs connectorV2 \ No newline at end of file + +.. note:: + RLlib's Connector API has been re-written from scratch for the new API stack (|newstack|). + We are now referring to connector-pieces and -pipelines as :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` + (as opposed to ``Connector``, which continue to work on the old API stack |oldstack|). + + +- |newstack| `Atari image frame stacking `__: + An example using Atari framestacking in a very efficient manner, NOT in the environment itself (as a `gym.Wrapper`), + but by stacking the observations on-the-fly using `EnvToModule` and `LearnerConnector` pipelines. + This method of framestacking is more efficient as it avoids having to send large observation + tensors through the network (ray). + +- |newstack| `Mean/STD filtering of observations `__: + An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that filters all observations from the environment using a + plain mean/STD filter (i.e. shift by mean and divide by std-dev). This example demonstrates + how a stateful :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` class has its states + (here the means and std's of the individual observation items) coming from the different + :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances a) merged into one common state and + then b) broadcast again back to the remote :py:class:`~ray.rllib.env.env_runner.EnvRunner` workers. + +- |newstack| `Include previous-action(s) and/or previous reward(s) in RLModule inputs `__: + An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that adds the n previous action(s) + and/or the m previous reward(s) to the RLModule's input dict (to perform its forward passes, both + for inference and training). + +- |newstack| `Nested action spaces `__: + Learning in arbitrarily nested action spaces, using an env in which the action space equals the + observation space (both are complex, nested Dicts) and the policy has to pick actions + that closely match (or are identical) to the previously seen observations. + +- |newstack| `Nested observation spaces `__: + Learning in arbitrarily nested observation spaces + (using a CartPole-v1 variant with a nested Dict observation space). 
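The connector entries above are plugged into the algorithm config by the linked example scripts via a callable that builds the ConnectorV2 piece(s) on each EnvRunner. As a rough, hedged sketch of that wiring pattern (not part of this patch; the ``MeanStdFilter`` import path and the ``env_to_module_connector`` argument are taken from the linked example scripts and may differ between RLlib versions):

.. code-block:: python

    # Minimal wiring sketch, assuming the new API stack is enabled
    # (for example via the `--enable-new-api-stack` flag the example scripts use).
    # Import path and argument names are assumptions and may vary by version.
    from ray.rllib.algorithms.ppo import PPOConfig
    from ray.rllib.connectors.env_to_module import MeanStdFilter

    config = (
        PPOConfig()
        .environment("Pendulum-v1")
        .env_runners(
            # Called on each EnvRunner to build that worker's local
            # env-to-module connector pipeline; here it adds a mean/std
            # observation filter piece.
            env_to_module_connector=lambda env: MeanStdFilter(),
        )
    )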
From 0916ce7d8119ba40fc1d2b17d3aace922b79db8b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 10 Jun 2024 14:33:25 +0200 Subject: [PATCH 13/23] wip Signed-off-by: sven1977 --- .../rllib/images/sigils/new-api-stack.svg | 1 + .../rllib/images/sigils/old-api-stack.svg | 1 + .../sigils/rllib-sigil-new-api-stack.svg | 1 - doc/source/rllib/rllib-examples.rst | 339 +++++++++++++++++- rllib/BUILD | 6 +- rllib/examples/README.rst | 313 ---------------- rllib/examples/algorithms/README.rst | 8 - rllib/examples/catalogs/README.rst | 0 rllib/examples/checkpoints/README.rst | 0 rllib/examples/connectors/README.rst | 34 -- rllib/examples/curriculum/README.rst | 12 - rllib/examples/debugging/README.rst | 0 rllib/examples/envs/README.rst | 0 rllib/examples/evaluation/README.rst | 0 rllib/examples/gpus/README.rst | 6 - rllib/examples/hierarchical/README.rst | 0 rllib/examples/inference/README.rst | 0 rllib/examples/learners/README.rst | 0 .../custom_metrics_and_callbacks.py | 0 rllib/examples/multi_agent/README.rst | 0 rllib/examples/offline_rl/README.rst | 0 rllib/examples/ray_serve/README.rst | 0 rllib/examples/ray_tune/README.rst | 0 rllib/examples/rl_modules/README.rst | 0 ...ning_single_agent_training_multi_agent.py} | 0 25 files changed, 343 insertions(+), 378 deletions(-) create mode 100644 doc/source/rllib/images/sigils/new-api-stack.svg create mode 100644 doc/source/rllib/images/sigils/old-api-stack.svg delete mode 100644 doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg delete mode 100644 rllib/examples/README.rst delete mode 100644 rllib/examples/algorithms/README.rst delete mode 100644 rllib/examples/catalogs/README.rst delete mode 100644 rllib/examples/checkpoints/README.rst delete mode 100644 rllib/examples/connectors/README.rst delete mode 100644 rllib/examples/curriculum/README.rst delete mode 100644 rllib/examples/debugging/README.rst delete mode 100644 rllib/examples/envs/README.rst delete mode 100644 rllib/examples/evaluation/README.rst delete mode 100644 rllib/examples/gpus/README.rst delete mode 100644 rllib/examples/hierarchical/README.rst delete mode 100644 rllib/examples/inference/README.rst delete mode 100644 rllib/examples/learners/README.rst rename rllib/examples/{ => metrics}/custom_metrics_and_callbacks.py (100%) delete mode 100644 rllib/examples/multi_agent/README.rst delete mode 100644 rllib/examples/offline_rl/README.rst delete mode 100644 rllib/examples/ray_serve/README.rst delete mode 100644 rllib/examples/ray_tune/README.rst delete mode 100644 rllib/examples/rl_modules/README.rst rename rllib/examples/rl_modules/{pretraining_single_agent_training_multi_agent_rlm.py => pretraining_single_agent_training_multi_agent.py} (100%) diff --git a/doc/source/rllib/images/sigils/new-api-stack.svg b/doc/source/rllib/images/sigils/new-api-stack.svg new file mode 100644 index 000000000000..ec8c5a035279 --- /dev/null +++ b/doc/source/rllib/images/sigils/new-api-stack.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/rllib/images/sigils/old-api-stack.svg b/doc/source/rllib/images/sigils/old-api-stack.svg new file mode 100644 index 000000000000..7c57ef12c9bd --- /dev/null +++ b/doc/source/rllib/images/sigils/old-api-stack.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg b/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg deleted file mode 100644 index 15c9e494533c..000000000000 --- a/doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg +++ /dev/null @@ -1 +0,0 @@ 
- \ No newline at end of file diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 4671cc601736..2c0ebbc279ab 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -2,4 +2,341 @@ .. include:: /_includes/rllib/new_api_stack.rst -.. include:: ../../../rllib/examples/README.rst +.. |new_stack| image:: /rllib/images/sigils/new-api-stack.svg + :class: inline-figure + :width: 32 + +.. |old_stack| image:: /rllib/images/sigils/old-api-stack.svg + :class: inline-figure + :width: 32 + + +Examples +======== + +This page contains an index of all the python scripts in the `examples/ folder ` +of RLlib, demonstrating the different use cases and features of the library. + +.. note:: + + RLlib is currently in a transition state from "old API stack" to "new API stack". + Some of the examples here haven't been translated yet to the new stack and are tagged + with the following comment line on top: ``# @OldAPIStack``. The moving of all example + scripts over to the "new API stack" is work in progress and expected to be completed + by the end of 2024. + +.. note:: + + If any new-API-stack example is broken, or if you'd like to add an example to this page, + feel free to raise an issue on RLlib's `github repository `. + + +Folder Structure +++++++++++++++++ +The `examples/ folder ` is +structured into several sub-directories, the contents of all of which are described in detail below. + + +How to run an example script +++++++++++++++++++++++++++++ + +Most of the example scripts are self-executable, meaning you can just ``cd`` into the respective +directory and type: + +.. code-block:: bash + + $ cd examples/multi_agent + $ python multi_agent_pendulum.py --enable-new-api-stack --num-agents=2 + + +Use the `--help` command line argument to have each script print out its supported command line options. + +Most of the scripts share a common subset of generally applicable command line arguments, +for example `--num-env-runners`, `--no-tune`, or `--wandb-key`. + + +All sub-folders ++++++++++++++++ + + +Algorithms +---------- + +- |new_stack| `Custom Algorith.training_step() method combining on- and off-policy learning `__: + Example of how to override the :py:meth:`~ray.rllib.algorithms.algorithm.training_step` method of the + :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel + (also using multi-agent API). + +Catalogs +-------- + + + +Checkpoints +----------- + +- |new_stack| `How to extract a checkpoint from n Tune trials using one or more custom criteria. `__: + Example of how to find a :ref:`checkpoint ` after a `Tuner.fit()` with some custom defined criteria. + + +Connectors +---------- + +.. note:: + RLlib's Connector API has been re-written from scratch for the new API stack (|new_stack|). + Connector-pieces and -pipelines are now referred to as :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` + (as opposed to ``Connector``, which only continue to work on the old API stack |old_stack|). + + +- |new_stack| `How to frame-stack Atari image observations `__: + An example using Atari framestacking in a very efficient manner, not in the environment itself (as a `gym.Wrapper`), + but by stacking the observations on-the-fly using `EnvToModule` and `LearnerConnector` pipelines. + This method of framestacking is more efficient as it avoids having to send large observation + tensors through the network (ray). 
+ +- |new_stack| `How to mean/std-filter observations `__: + An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that filters all observations from the environment using a + plain mean/std-filter (that is shift by mean and divide by std-dev). This example demonstrates + how a stateful :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` class has its states + (here the means and standard deviations of the individual observation items) coming from the different + :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances a) merged into one common state and + then b) broadcast again back to the remote :py:class:`~ray.rllib.env.env_runner.EnvRunner` workers. + +- |new_stack| `How to include previous-actions and/or previous rewards in RLModule inputs `__: + An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that adds the n previous actions + and/or the m previous rewards to the RLModule's input dict (to perform its forward passes, both + for inference and training). + +- |new_stack| `How to train with nested action spaces `__: + Learning in arbitrarily nested action spaces, using an env in which the action space equals the + observation space (both are complex, nested Dicts) and the policy has to pick actions + that closely match (or are identical) to the previously seen observations. + +- |new_stack| `How to train with nested observation spaces `__: + Learning in arbitrarily nested observation spaces + (using a CartPole-v1 variant with a nested Dict observation space). + +Curriculum Learning +------------------- + +- |new_stack| `How to set up curriculum learning with the custom callbacks API `__: + Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) + and thus help the learning algorithm to cope with an otherwise unsolvable task. + Also see the :doc:`curriculum learning how-to ` from the documentation. + +Debugging +--------- + + +Environments +------------ +- |new_stack| `How to register a custom gymnasium environment `__: + Example showing how to write your own RL environment using ``gymnasium`` and register it to run train your algorithm against this env with RLlib. + +- |new_stack| `How to set up rendering (and recording) of the environment trajectories during training with WandB `__: + Example showing how you can render and record episode trajectories of your gymnasium envs and log the videos to WandB. + +- |old_stack| `Local Unity3D multi-agent environment example `__: + Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to + learn any Unity3D game (including support for multi-agent). + Use this example to try things out and watch the game and the learning progress live in the editor. + Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. + For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. + +Evaluation +---------- + +- |new_stack| `How to run evaluation with a custom evaluation function `__: + Example of how to write a custom evaluation function that's called instead of the default behavior, which is running with the evaluation worker set through n episodes. + +- |new_stack| `How to run evaluation in parallel to training `__: + Example showing how the evaluation workers and the "normal" rollout workers can run (to some extend) in parallel to speed up training. 
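The two evaluation entries above revolve around the ``.evaluation(...)`` section of the algorithm config. As a rough, hedged sketch of the kind of setup those scripts use (not part of this patch; argument names are assumptions and may differ between RLlib versions):

.. code-block:: python

    # Minimal sketch: dedicated evaluation EnvRunners running (partly) in
    # parallel to training. Exact argument names may vary across versions.
    from ray.rllib.algorithms.ppo import PPOConfig

    config = (
        PPOConfig()
        .environment("CartPole-v1")
        .evaluation(
            evaluation_interval=1,            # evaluate every training iteration
            evaluation_num_env_runners=2,     # dedicated evaluation workers
            evaluation_duration=10,           # episodes (default unit) per evaluation round
            evaluation_parallel_to_training=True,
        )
    )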
+ +GPU (for Training and Sampling) +------------------------------- + +- |new_stack| `How to use fractional GPUs for training an RLModule `__: + If your model is small and easily fits on a single GPU and you want to therefore train + other models alongside it to save time and cost, this script shows you how to set up + your RLlib config with a fractional number of GPUs on the learner (model training) + side. + +Hierarchical Training +--------------------- + +- |old_stack| `How to setup hierarchical training `__: + Example of hierarchical training using the multi-agent API. + +Inference (of Models/Policies) +------------------------------ + +- + + +Metrics +------- + +- |old_stack| `How to write your own custom metrics and callbacks in RLlib `__: + Example of how to output custom training metrics to TensorBoard. + + +Multi-Agent RL +-------------- + +- |new_stack| `How to set up independent multi-agent training `__: + Set up RLlib to run any algorithm in (independent) multi-agent mode against a multi-agent environment. +- |new_stack| `How to set up shared-parameter multi-agent training `__: + Set up RLlib to run any algorithm in (shared-parameter) multi-agent mode against a multi-agent environment. +- |new_stack| `How to compare a heuristic policy vs a trained one on rock-paper-scissors `__ and `Rock-paper-scissors learned vs learned `__: + Two examples of different heuristic and learned policies competing against each other in the rock-paper-scissors environment. +- |new_stack| `How to use agent grouping in a multi-agent environment (two-step game) `__: + Example on how to use agent grouping in a multi-agent environment (the two-step game from the `QMIX paper `__). +- |new_stack| `How to set up multi-agent training vs a PettingZoo environment `__: + Example on how to use RLlib to learn in `PettingZoo `__ multi-agent environments. +- |new_stack| `How to hand-code a (heuristic) policy `__: + Example of running a custom hand-coded policy alongside trainable policies. +- |new_stack| `How to train a single policy (weight sharing) controlling more than one agents `__: + Example of how to define weight-sharing layers between two different policies. + +- |old_stack| `PPO with centralized critic on two-step game `__: + Example of customizing PPO to leverage a centralized value function. +- |old_stack| `Centralized critic in the env `__: + A simpler method of implementing a centralized critic by augmenting agent observations with global information. +- |old_stack| `Multiple algorithms `__: + Example of alternating training between DQN and PPO. + +Offline RL +---------- + +- |old_stack| `Offline RL with CQL `__: + Example showing how to run an offline RL training job using a historic-data JSON file. + + +Ray Serve and RLlib +------------------- + +- |new_stack| `How to use a trained RLlib algorithm with Ray Serve `__ + This script offers a simple workflow for 1) training a policy with RLlib first, 2) creating a new policy 3) restoring its weights from the trained + one and serving the new policy with Ray Serve. + +Ray Tune and RLlib +------------------ +- |new_stack| `How to define a custom progress reporter and use it with Ray Tune and RLlib `__: + Example of how to write your own progress reporter (for a multi-agent experiment) and use it with Ray Tune and RLlib. + +- |new_stack| `How to define and plug in your custom logger into Ray Tune and RLlib `__: + How to setup a custom Logger object in RLlib and use it with Ray Tune. 
+ +- |new_stack| `How to Custom tune experiment `__: + How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. + +RLModules +--------- + +- |new_stack| `How to Custom tune experiment `__: + How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. + +- |new_stack| `Autoregressive action distribution example `__: + Learning with an auto-regressive action distribution (for example, two action components, where distribution of the second component depends on the first's actually sampled value). + +- |old_stack| `Parametric actions `__: + Example of how to handle variable-length or parametric action spaces. + +- |old_stack| `Using the "Repeated" space of RLlib for variable lengths observations `__: + How to use RLlib's `Repeated` space to handle variable length observations. + + +Tuned Examples +++++++++++++++ + +- `Tuned examples `__: + Collection of tuned hyperparameters sorted by algorithm. + + + +TODO: clean up from here on + + +Custom- and Complex Models +-------------------------- + +- |old_stack| `Custom Keras model `__: + Example of using a custom Keras model. +- |old_stack| `Registering a custom model with supervised loss `__: + Example of defining and registering a custom model with a supervised loss. +- |old_stack| `Batch normalization `__: + Example of adding batch norm layers to a custom model. +- |old_stack| `Custom model API example `__: + Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. +- |old_stack| `Trajectory View API utilizing model `__: + An example on how a model can use the trajectory view API to specify its own input. +- |old_stack| `MobileNetV2 wrapping example model `__: + Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. +- |old_stack| `Differentiable Neural Computer `__: + Example of DeepMind's Differentiable Neural Computer for partially observable environments. + + +Serving and Offline +------------------- +- `Unity3D client/server `__: + Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting + clients against a central RLlib Policy server learning how to play the game. + The n distributed clients could themselves be servers for external/human players and allow for control + being fully in the hands of the Unity entities instead of RLlib. + Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. +- `CartPole client/server `__: + Example of online serving of predictions for a simple CartPole policy. +- `Saving experiences `__: + Example of how to externally generate experience batches in RLlib-compatible format. + + +Community Examples +------------------ +- |old_stack| `Arena AI `__: + A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence + with RLlib-generated baselines. +- |old_stack| `CARLA `__: + Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. +- |old_stack| `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: + Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the + "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). +- |old_stack| `Flatland `__: + A dense traffic simulating environment with RLlib-generated baselines. 
+- |old_stack| `GFootball `__: + Example of setting up a multi-agent version of `GFootball `__ with RLlib. +- |old_stack| `mobile-env `__: + An open, minimalist Gymnasium environment for autonomous coordination in wireless mobile networks. + Includes an example notebook using Ray RLlib for multi-agent RL with mobile-env. +- |old_stack| `Neural MMO `__: + A multiagent AI research environment inspired by Massively Multiplayer Online (MMO) role playing games – + self-contained worlds featuring thousands of agents per persistent macrocosm, diverse skilling systems, local and global economies, complex emergent social structures, + and ad-hoc high-stakes single and team based conflict. +- |old_stack| `NeuroCuts `__: + Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. +- |old_stack| `NeuroVectorizer `__: + Example of learning optimal LLVM vectorization compiler pragmas for loops in C and C++ codes using RLlib. +- |old_stack| `Roboschool / SageMaker `__: + Example of training robotic control policies in SageMaker with RLlib. +- |old_stack| `Sequential Social Dilemma Games `__: + Example of using the multi-agent API to model several `social dilemma games `__. +- |old_stack| `Simple custom environment for single RL with Ray and RLlib `__: + Create a custom environment and train a single agent RL using Ray 2.0 with Tune. +- |old_stack| `StarCraft2 `__: + Example of training in StarCraft2 maps with RLlib / multi-agent. +- |old_stack| `Traffic Flow `__: + Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. + + +Blog Posts +---------- + +- |old_stack| `Attention Nets and More with RLlib’s Trajectory View API `__: + Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. +- |old_stack| `Reinforcement Learning with RLlib in the Unity Game Engine `__: + How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. +- |old_stack| `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: + Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. +- |old_stack| `Scaling Multi-Agent Reinforcement Learning `__: + Blog post of a brief tutorial on multi-agent RL and its design in RLlib. +- |old_stack| `Functional RL with Keras and TensorFlow Eager `__: + Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. 
diff --git a/rllib/BUILD b/rllib/BUILD index 1f46b6618f21..d85f221df455 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2917,11 +2917,11 @@ py_test( ) py_test( - name = "examples/rl_modules/pretraining_single_agent_training_multi_agent_rlm", - main = "examples/rl_modules/pretraining_single_agent_training_multi_agent_rlm.py", + name = "examples/rl_modules/pretraining_single_agent_training_multi_agent", + main = "examples/rl_modules/pretraining_single_agent_training_multi_agent.py", tags = ["team:rllib", "examples"], size = "medium", - srcs = ["examples/rl_modules/pretraining_single_agent_training_multi_agent_rlm.py"], + srcs = ["examples/rl_modules/pretraining_single_agent_training_multi_agent.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iters-pretraining=5", "--stop-iters=20", "--stop-reward=150.0"], ) diff --git a/rllib/examples/README.rst b/rllib/examples/README.rst deleted file mode 100644 index 4f7d020b30c7..000000000000 --- a/rllib/examples/README.rst +++ /dev/null @@ -1,313 +0,0 @@ -.. |newstack| image:: images/tensorflow.png - :class: inline-figure - :width: 16 - - -Examples -======== - -This page contains an index of all the python scripts in the `examples/ folder ` -of RLlib, demonstrating the different use cases and features of the library. - -.. note:: - - RLlib is currently in a transition state from "old API stack" to "new API stack". - Some of the examples here haven't been translated yet to the new stack and are tagged - with the following comment line on top: ``# @OldAPIStack``. The moving of all example - scripts over to the "new API stack" is work in progress and expected to be completed - by the end of 2024. - -.. note:: - - If any new-API-stack example is broken, or if you'd like to add an example to this page, - feel free to raise an issue on RLlib's `Github repository `. - - -Folder Structure -++++++++++++++++ -The `examples/ folder ` is -structured into several sub-directories, the contents of all of which are described in detail below. - - -How to run an example script -++++++++++++++++++++++++++++ - -Most of the example scripts are self-executable, meaning you can just ``cd`` into the respective -directory and type: - -.. code-block:: bash - - $ cd examples/multi_agent - $ python multi_agent_pendulum.py --enable-new-api-stack --num-agents=2 - - -Use the `--help` command line argument to have each script print out its supported command line options. - -Most of the scripts share a common subset of generally applicable command line arguments, -for example `--num-env-runners`, `--no-tune`, or `--wandb-key`. - - -All sub-folders -+++++++++++++++ - - -Algorithms ----------- - -.. include:: algorithms/README.rst - -Catalogs --------- - -.. include:: catalogs/README.rst - -Checkpoints ------------ - -.. include:: checkpoints/README.rst - -Connectors ----------- - -.. include:: connectors/README.rst - -Curriculum Learning -------------------- - -.. include:: curriculum/README.rst - -Debugging ---------- - -.. include:: debugging/README.rst - -Environments ------------- - -.. include:: envs/README.rst - -Evaluation ----------- - -.. include:: evaluation/README.rst - -GPU (for Training and Sampling) -------------------------------- - -.. include:: gpus/README.rst - -Hierarchical Training ---------------------- - -.. include:: hierarchical/README.rst - -Inference (of Models/Policies) ------------------------------- - -.. include:: inference/README.rst - -Learners --------- - -.. include:: learners/README.rst - -Multi-Agent RL --------------- - -.. 
include:: multi_agent/README.rst - -Offline RL ----------- - -.. include:: offline/README.rst - -Ray Serve and RLlib -------------------- - -.. include:: ray_serve/README.rst - -Ray Tune and RLlib ------------------- - -.. include:: ray_tune/README.rst - -RLModules ---------- - -.. include:: rl_modules/README.rst - - -Tuned Examples -++++++++++++++ - - -- `Tuned examples `__: - Collection of tuned hyperparameters sorted by algorithm. - - - - -************************************************************** - - -Environments and Adapters -------------------------- - -- |oldstack| `Registering a custom env and model `__: - Example of defining and registering a gym env and model for use with RLlib. -- |oldstack| `Local Unity3D multi-agent environment example `__: - Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to - learn any Unity3D game (including support for multi-agent). - Use this example to try things out and watch the game and the learning progress live in the editor. - Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. - For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. - -Custom- and Complex Models --------------------------- - -- |oldstack| `Custom Keras model `__: - Example of using a custom Keras model. -- |oldstack| `Registering a custom model with supervised loss `__: - Example of defining and registering a custom model with a supervised loss. -- |oldstack| `Batch normalization `__: - Example of adding batch norm layers to a custom model. -- |oldstack| `Custom model API example `__: - Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. -- |oldstack| `Trajectory View API utilizing model `__: - An example on how a model can use the trajectory view API to specify its own input. -- |oldstack| `MobileNetV2 wrapping example model `__: - Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. -- |oldstack| `Differentiable Neural Computer `__: - Example of DeepMind's Differentiable Neural Computer for partially observable environments. - - -Training Workflows ------------------- - -- `Custom training workflows `__: - Example of how to use Tune's support for custom training functions to implement custom training workflows. -- `Custom logger `__: - How to setup a custom Logger object in RLlib. -- `Custom metrics `__: - Example of how to output custom training metrics to TensorBoard. -- `Custom tune experiment `__: - How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. - - -Evaluation: ------------ -- `Custom evaluation function `__: - Example of how to write a custom evaluation function that's called instead of the default behavior, which is running with the evaluation worker set through n episodes. -- `Parallel evaluation and training `__: - Example showing how the evaluation workers and the "normal" rollout workers can run (to some extend) in parallel to speed up training. - - -Serving and Offline -------------------- -- `Offline RL with CQL `__: - Example showing how to run an offline RL training job using a historic-data JSON file. -- `Another example for using RLlib with Ray Serve `__ - This script offers a simple workflow for 1) training a policy with RLlib first, 2) creating a new policy 3) restoring its weights from the trained - one and serving the new policy with Ray Serve. 
-- `Unity3D client/server `__: - Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting - clients against a central RLlib Policy server learning how to play the game. - The n distributed clients could themselves be servers for external/human players and allow for control - being fully in the hands of the Unity entities instead of RLlib. - Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. -- `CartPole client/server `__: - Example of online serving of predictions for a simple CartPole policy. -- `Saving experiences `__: - Example of how to externally generate experience batches in RLlib-compatible format. -- `Finding a checkpoint using custom criteria `__: - Example of how to find a :ref:`checkpoint ` after a `Tuner.fit()` with some custom defined criteria. - - -Multi-Agent and Hierarchical ----------------------------- - -- `Simple independent multi-agent setup vs a PettingZoo env `__: - Setup RLlib to run any algorithm in (independent) multi-agent mode against a multi-agent environment. -- `More complex (shared-parameter) multi-agent setup vs a PettingZoo env `__: - Setup RLlib to run any algorithm in (shared-parameter) multi-agent mode against a multi-agent environment. -- `Rock-paper-scissors heuristic vs learned `__ and `Rock-paper-scissors learned vs learned `__: - Two examples of different heuristic and learned policies competing against each other in the rock-paper-scissors environment. -- `Two-step game `__: - Example on how to use agent grouping in a multi-agent environment (the two-step game from the `QMIX paper `__). -- `PettingZoo multi-agent example `__: - Example on how to use RLlib to learn in `PettingZoo `__ multi-agent environments. -- `PPO with centralized critic on two-step game `__: - Example of customizing PPO to leverage a centralized value function. -- `Centralized critic in the env `__: - A simpler method of implementing a centralized critic by augmenting agent observations with global information. -- `Hand-coded policy `__: - Example of running a custom hand-coded policy alongside trainable policies. -- `Weight sharing between policies `__: - Example of how to define weight-sharing layers between two different policies. -- `Multiple algorithms `__: - Example of alternating training between DQN and PPO. -- `Hierarchical training `__: - Example of hierarchical training using the multi-agent API. - - -Special Action- and Observation Spaces --------------------------------------- - -- |newstack| `Autoregressive action distribution example `__: - Learning with an auto-regressive action distribution (for example, two action components, where distribution of the second component depends on the first's actually sampled value). -- |oldstack| `Parametric actions `__: - Example of how to handle variable-length or parametric action spaces. -- |oldstack| `Using the "Repeated" space of RLlib for variable lengths observations `__: - How to use RLlib's `Repeated` space to handle variable length observations. - - -Community Examples ------------------- -- |oldstack| `Arena AI `__: - A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence - with RLlib-generated baselines. -- |oldstack| `CARLA `__: - Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. 
-- |oldstack| `The Emergence of Adversarial Communication in Multi-Agent Reinforcement Learning `__: - Using Graph Neural Networks and RLlib to train multiple cooperative and adversarial agents to solve the - "cover the area"-problem, thereby learning how to best communicate (or - in the adversarial case - how to disturb communication) (`code `__). -- |oldstack| `Flatland `__: - A dense traffic simulating environment with RLlib-generated baselines. -- |oldstack| `GFootball `__: - Example of setting up a multi-agent version of `GFootball `__ with RLlib. -- |oldstack| `mobile-env `__: - An open, minimalist Gymnasium environment for autonomous coordination in wireless mobile networks. - Includes an example notebook using Ray RLlib for multi-agent RL with mobile-env. -- |oldstack| `Neural MMO `__: - A multiagent AI research environment inspired by Massively Multiplayer Online (MMO) role playing games – - self-contained worlds featuring thousands of agents per persistent macrocosm, diverse skilling systems, local and global economies, complex emergent social structures, - and ad-hoc high-stakes single and team based conflict. -- |oldstack| `NeuroCuts `__: - Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. -- |oldstack| `NeuroVectorizer `__: - Example of learning optimal LLVM vectorization compiler pragmas for loops in C and C++ codes using RLlib. -- |oldstack| `Roboschool / SageMaker `__: - Example of training robotic control policies in SageMaker with RLlib. -- |oldstack| `Sequential Social Dilemma Games `__: - Example of using the multi-agent API to model several `social dilemma games `__. -- |oldstack| `Simple custom environment for single RL with Ray and RLlib `__: - Create a custom environment and train a single agent RL using Ray 2.0 with Tune. -- |oldstack| `StarCraft2 `__: - Example of training in StarCraft2 maps with RLlib / multi-agent. -- |oldstack| `Traffic Flow `__: - Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. - - -Blog Posts ----------- - -- |oldstack| `Attention Nets and More with RLlib’s Trajectory View API `__: - Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. -- |oldstack| `Reinforcement Learning with RLlib in the Unity Game Engine `__: - How-To guide about connecting RLlib with the Unity3D game engine for running visual- and physics-based RL experiments. -- |oldstack| `Lessons from Implementing 12 Deep RL Algorithms in TF and PyTorch `__: - Discussion on how the Ray Team ported 12 of RLlib's algorithms from TensorFlow to PyTorch and the lessons learned. -- |oldstack| `Scaling Multi-Agent Reinforcement Learning `__: - Blog post of a brief tutorial on multi-agent RL and its design in RLlib. -- |oldstack| `Functional RL with Keras and TensorFlow Eager `__: - Exploration of a functional paradigm for implementing reinforcement learning (RL) algorithms. diff --git a/rllib/examples/algorithms/README.rst b/rllib/examples/algorithms/README.rst deleted file mode 100644 index fc6f009170de..000000000000 --- a/rllib/examples/algorithms/README.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. 
|newstack| image:: ../../doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg - :class: inline-figure - :width: 32 - -- |newstack| `Custom Algorith.training_step() method combining on- and off-policy learning `__: - Example of how to override the :py:meth:`~ray.rllib.algorithms.algorithm.training_step` method of the - :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel - (also using multi-agent API). diff --git a/rllib/examples/catalogs/README.rst b/rllib/examples/catalogs/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/checkpoints/README.rst b/rllib/examples/checkpoints/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/connectors/README.rst b/rllib/examples/connectors/README.rst deleted file mode 100644 index 119100eeb9ba..000000000000 --- a/rllib/examples/connectors/README.rst +++ /dev/null @@ -1,34 +0,0 @@ - -.. note:: - RLlib's Connector API has been re-written from scratch for the new API stack (|newstack|). - We are now referring to connector-pieces and -pipelines as :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` - (as opposed to ``Connector``, which continue to work on the old API stack |oldstack|). - - -- |newstack| `Atari image frame stacking `__: - An example using Atari framestacking in a very efficient manner, NOT in the environment itself (as a `gym.Wrapper`), - but by stacking the observations on-the-fly using `EnvToModule` and `LearnerConnector` pipelines. - This method of framestacking is more efficient as it avoids having to send large observation - tensors through the network (ray). - -- |newstack| `Mean/STD filtering of observations `__: - An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that filters all observations from the environment using a - plain mean/STD filter (i.e. shift by mean and divide by std-dev). This example demonstrates - how a stateful :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` class has its states - (here the means and std's of the individual observation items) coming from the different - :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances a) merged into one common state and - then b) broadcast again back to the remote :py:class:`~ray.rllib.env.env_runner.EnvRunner` workers. - -- |newstack| `Include previous-action(s) and/or previous reward(s) in RLModule inputs `__: - An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that adds the n previous action(s) - and/or the m previous reward(s) to the RLModule's input dict (to perform its forward passes, both - for inference and training). - -- |newstack| `Nested action spaces `__: - Learning in arbitrarily nested action spaces, using an env in which the action space equals the - observation space (both are complex, nested Dicts) and the policy has to pick actions - that closely match (or are identical) to the previously seen observations. - -- |newstack| `Nested observation spaces `__: - Learning in arbitrarily nested observation spaces - (using a CartPole-v1 variant with a nested Dict observation space). diff --git a/rllib/examples/curriculum/README.rst b/rllib/examples/curriculum/README.rst deleted file mode 100644 index bf88a730e83c..000000000000 --- a/rllib/examples/curriculum/README.rst +++ /dev/null @@ -1,12 +0,0 @@ - -.. 
|new_stack| image:: ../../../doc/source/rllib/images/sigils/rllib-sigil-new-api-stack.svg - :class: inline-figure - :width: 40 - -- |new_stack| `Curriculum learning with the custom callbacks API `__: - Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) - and thus help the learning algorithm to cope with an otherwise unsolvable task. - Also see the :doc:`curriculum learning how-to ` from the documentation. - - - diff --git a/rllib/examples/debugging/README.rst b/rllib/examples/debugging/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/envs/README.rst b/rllib/examples/envs/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/evaluation/README.rst b/rllib/examples/evaluation/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/gpus/README.rst b/rllib/examples/gpus/README.rst deleted file mode 100644 index 18b254f09b55..000000000000 --- a/rllib/examples/gpus/README.rst +++ /dev/null @@ -1,6 +0,0 @@ - -- `Using fractional GPUs for training your model `__: - If your model is small and easily fits on a single GPU and you want to therefore train - other models alongside it to save time and cost, this script shows you how to set up - your RLlib config with a fractional number of GPUs on the learner (model training) - side. diff --git a/rllib/examples/hierarchical/README.rst b/rllib/examples/hierarchical/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/inference/README.rst b/rllib/examples/inference/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/learners/README.rst b/rllib/examples/learners/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/metrics/custom_metrics_and_callbacks.py similarity index 100% rename from rllib/examples/custom_metrics_and_callbacks.py rename to rllib/examples/metrics/custom_metrics_and_callbacks.py diff --git a/rllib/examples/multi_agent/README.rst b/rllib/examples/multi_agent/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/offline_rl/README.rst b/rllib/examples/offline_rl/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/ray_serve/README.rst b/rllib/examples/ray_serve/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/ray_tune/README.rst b/rllib/examples/ray_tune/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/rl_modules/README.rst b/rllib/examples/rl_modules/README.rst deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent_rlm.py b/rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py similarity index 100% rename from rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent_rlm.py rename to rllib/examples/rl_modules/pretraining_single_agent_training_multi_agent.py From 23fcf8d9c04cb765fc715429f1fc2e29e65415e7 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 10 Jun 2024 14:50:12 +0200 Subject: [PATCH 14/23] Apply suggestions from code review Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com> Signed-off-by: Sven Mika --- 
doc/source/rllib/rllib-advanced-api.rst | 28 +++++++++---------- .../_old_api_stack/complex_struct_space.py | 2 +- .../connectors/adapt_connector_policy.py | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/source/rllib/rllib-advanced-api.rst b/doc/source/rllib/rllib-advanced-api.rst index cb39b8fdc13d..aa125f5768b0 100644 --- a/doc/source/rllib/rllib-advanced-api.rst +++ b/doc/source/rllib/rllib-advanced-api.rst @@ -19,30 +19,30 @@ implement `custom training workflows (example) `__ -for another example of how curriculum learning can be done. +Normally, such a curriculum starts with setting the environment to an easy level and +then - as training progresses - transitions more toward a harder-to-solve difficulty. +See the `Reverse Curriculum Generation for Reinforcement Learning Agents `_ blog post +for another example of how you can do curriculum learning. RLlib's Algorithm and custom callbacks APIs allow for implementing any arbitrary -curricula. We will quickly touch on the `example script found here <>`__ to introduce -the basic concepts needed, then refer you +curricula. This `example script <>`__ introduces +the basic concepts you need to understand. -First, we define some env options. We will work with the `FrozenLake-v1` environment, -a grid world, whose map is fully customizable. Our three tasks (different env difficulties) -are represented by slightly different maps that our agent will have to navigate. +First, define some env options. This example uses the `FrozenLake-v1` environment, +a grid world, whose map is fully customizable. Three tasks of different env difficulties +are represented by slightly different maps that the agent has to navigate. .. literalinclude:: /../../rllib/examples/curriculum/curriculum_learning.py :language: python :start-after: __curriculum_learning_example_env_options__ :end-before: __END_curriculum_learning_example_env_options__ -Then, we define the central piece controlling the curriculum, which is a custom callbacks class -overriding the :py:meth:`~ray.rllib.algorithms.callbacks.Callbacks.on_train_result` +Then, define the central piece controlling the curriculum, which is a custom callbacks class +overriding the :py:meth:`~ray.rllib.algorithms.callbacks.Callbacks.on_train_result`. .. TODO move to doc_code and make it use algo configs. diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py index 4603ed882250..1a1ecd28e122 100644 --- a/rllib/examples/_old_api_stack/complex_struct_space.py +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -1,5 +1,5 @@ # @OldAPIStack -"""Example of using variable-length Repeated / struct observation spaces. +"""Example of using variable-length Repeated or struct observation spaces. This example demonstrates the following: - using a custom environment with Repeated / struct observations diff --git a/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py index ce2e99211ae4..db59a49dcdbc 100644 --- a/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/adapt_connector_policy.py @@ -1,6 +1,6 @@ # @OldAPIStack """This example script shows how to load a connector enabled policy, -and adapt/use it with a different version of the environment. +and adapt or use it with a different version of the environment. 
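To make the callback-driven curriculum described in the ``rllib-advanced-api.rst`` changes above more concrete, here is a minimal, hedged sketch. The ``CurriculumCallback`` name, the return threshold, and the task counter are assumptions made for illustration; they are not the API of the linked example script, which instead swaps in a new, harder ``FrozenLake`` map on all EnvRunners.

.. code-block:: python

    from ray.rllib.algorithms.callbacks import DefaultCallbacks
    from ray.rllib.utils.metrics import ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN


    class CurriculumCallback(DefaultCallbacks):
        """Sketch: raise the task level whenever the mean return clears a threshold."""

        def on_train_result(self, *, algorithm, result, **kwargs):
            mean_return = result.get(ENV_RUNNER_RESULTS, {}).get(EPISODE_RETURN_MEAN, 0.0)
            # For this sketch, only track the level as an attribute on the
            # Algorithm object; the actual example script additionally pushes
            # the new task (a tougher map) out to all EnvRunners.
            level = getattr(algorithm, "_task_level", 0)
            if mean_return >= 0.9 and level < 2:
                algorithm._task_level = level + 1
                print(f"Promoting curriculum to task {level + 1}")

Attaching the class with ``config.callbacks(CurriculumCallback)`` is enough to have it run once per training iteration.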
""" import gymnasium as gym From 8e2afcc17ee6ccc06415dfcc61cbcba8a9f31c7f Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 10 Jun 2024 14:56:45 +0200 Subject: [PATCH 15/23] Apply suggestions from code review Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com> Signed-off-by: Sven Mika --- .../_old_api_stack/connectors/run_connector_policy.py | 4 ++-- .../connectors/self_play_with_policy_checkpoint.py | 4 ++-- rllib/examples/_old_api_stack/parametric_actions_cartpole.py | 2 +- .../parametric_actions_cartpole_embeddings_learnt_by_model.py | 2 +- .../_old_api_stack/remote_base_env_with_custom_api.py | 2 +- .../remote_envs_with_inference_done_on_main_node.py | 2 +- rllib/examples/_old_api_stack/sb2rllib_rllib_example.py | 2 +- rllib/examples/_old_api_stack/sb2rllib_sb_example.py | 2 +- rllib/examples/action_masking.py | 2 +- rllib/examples/centralized_critic.py | 2 +- rllib/examples/centralized_critic_2.py | 2 +- .../examples/compute_adapted_gae_on_postprocess_trajectory.py | 4 ++-- rllib/examples/curriculum/curriculum_learning.py | 2 +- rllib/examples/custom_recurrent_rnn_tokenizer.py | 2 +- rllib/examples/gpus/fractional_gpus_per_learner.py | 2 +- rllib/examples/offline_rl/custom_input_api.py | 2 +- rllib/examples/offline_rl/offline_rl.py | 2 +- 17 files changed, 20 insertions(+), 20 deletions(-) diff --git a/rllib/examples/_old_api_stack/connectors/run_connector_policy.py b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py index 7daf136cdc66..dd088e75b0aa 100644 --- a/rllib/examples/_old_api_stack/connectors/run_connector_policy.py +++ b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py @@ -1,6 +1,6 @@ # @OldAPIStack -"""This example script shows how to load a connector enabled policy, -and use it in a serving/inference setting. +"""This example script loads a connector enabled policy, +and uses it in a serving or inference setting. """ import gymnasium as gym diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index 460b7ad79058..f15994c0456c 100644 --- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -1,7 +1,7 @@ # @OldAPIStack -"""Example showing how one can restore a connector enabled TF policy +"""Example showing to restore a connector enabled TF policy checkpoint for a new self-play PyTorch training job. -The checkpointed policy may be trained with a different algorithm too. +You can train the checkpointed policy with a different algorithm too. """ import argparse diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py index c34d9133795b..2f8832af502a 100644 --- a/rllib/examples/_old_api_stack/parametric_actions_cartpole.py +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole.py @@ -1,5 +1,5 @@ # @OldAPIStack -"""Example of handling variable length and/or parametric action spaces. +"""Example of handling variable length or parametric action spaces. 
This toy example demonstrates the action-embedding based approach for handling large discrete action spaces (potentially infinite in size), similar to this example: diff --git a/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py index c025714c346e..2750e68ec4c1 100644 --- a/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py +++ b/rllib/examples/_old_api_stack/parametric_actions_cartpole_embeddings_learnt_by_model.py @@ -1,5 +1,5 @@ # @OldAPIStack -"""Example of handling variable length and/or parametric action spaces. +"""Example of handling variable length or parametric action spaces. This is a toy example of the action-embedding based approach for handling large discrete action spaces (potentially infinite in size), similar to this: diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py index ea01edcee5eb..4fa783196cf5 100644 --- a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -1,6 +1,6 @@ # @OldAPIStack """ -This script demonstrates how to specify custom env APIs in +This script specifies custom env APIs in combination with RLlib's `remote_worker_envs` setting, which parallelizes individual sub-envs within a vector env by making each one a Ray Actor. diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index cfc77968d7b9..eac4adbc6064 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -1,6 +1,6 @@ # @OldAPIStack """ -This script demonstrates how to specify n (vectorized) envs +This script specifies n (vectorized) envs as Ray remote (actors), such that stepping through these occurs in parallel. Also, actions for each env step are calculated on the "main" node. diff --git a/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py index 3dcafa05ac39..28b5ddd830b9 100644 --- a/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py +++ b/rllib/examples/_old_api_stack/sb2rllib_rllib_example.py @@ -1,6 +1,6 @@ # @OldAPIStack """ -Example script on how to train, save, load, and test an RLlib agent. +Example script that trains, saves, loads, and tests an RLlib agent. Equivalent script with stable baselines: sb2rllib_sb_example.py. Demonstrates transition from stable_baselines to Ray RLlib. diff --git a/rllib/examples/_old_api_stack/sb2rllib_sb_example.py b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py index 4f1be19c6a3b..8e3686074935 100644 --- a/rllib/examples/_old_api_stack/sb2rllib_sb_example.py +++ b/rllib/examples/_old_api_stack/sb2rllib_sb_example.py @@ -1,6 +1,6 @@ # @OldAPIStack """ -Example script on how to train, save, load, and test a stable baselines 2 agent. +Example script that trains, saves, loads, and tests a stable baselines 2 agent. 
Code taken and adjusted from SB2 docs: https://stable-baselines.readthedocs.io/en/master/guide/quickstart.html Equivalent script with RLlib: sb2rllib_rllib_example.py diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index b89c7d6b23d1..3d49117c94d4 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -1,6 +1,6 @@ # @OldAPIStack -"""Example showing how to use "action masking" in RLlib. +"""Example that uses "action masking" in RLlib. "Action masking" allows the agent to select actions based on the current observation. This is useful in many practical scenarios, where different diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 1505a01c0f21..a54caf84100c 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -1,7 +1,7 @@ # @OldAPIStack # *********************************************************************************** -# IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by +# IMPORTANT NOTE: This script uses the old API stack and will soon be replaced by # `ray.rllib.examples.multi_agent.pettingzoo_shared_value_function.py`! # *********************************************************************************** diff --git a/rllib/examples/centralized_critic_2.py b/rllib/examples/centralized_critic_2.py index 6a2392f96385..cdc86f218cee 100644 --- a/rllib/examples/centralized_critic_2.py +++ b/rllib/examples/centralized_critic_2.py @@ -1,7 +1,7 @@ # @OldAPIStack # *********************************************************************************** -# IMPORTANT NOTE: This script is using the old API stack and will soon be replaced by +# IMPORTANT NOTE: This script uses the old API stack and will soon be replaced by # `ray.rllib.examples.multi_agent.pettingzoo_shared_value_function.py`! # *********************************************************************************** diff --git a/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py b/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py index b7cc351be042..19c28bb3ccb4 100644 --- a/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py +++ b/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py @@ -1,8 +1,8 @@ # @OldAPIStack """ -Adapted (time-dependent) GAE for PPO algorithm can be activated by setting -use_adapted_gae=True in the policy config. Additionally, it is required that +Adapted (time-dependent) GAE for PPO algorithm that you can activate by setting +use_adapted_gae=True in the policy config. Additionally, it's required that "callbacks" include the custom callback class in the Algorithm's config. Furthermore, the env must return in its info dictionary a key-value pair of the form "d_ts": ... where the value is the length (time) of recent agent step. diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 96539e7c2ec9..1e7ba0250ae0 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -136,7 +136,7 @@ # __END_curriculum_learning_example_env_options__ -# Simple function sent to an EnvRunner to change the map of all its gym.Envs from +# Simple function sent to an EnvRunner to change the map of all its gym. Envs from # the current one to a new (tougher) one, in which the goal position is further away # from the starting position. 
Note that a map is a list of strings, each one # representing one row in the map. Each character in the strings represent a single diff --git a/rllib/examples/custom_recurrent_rnn_tokenizer.py b/rllib/examples/custom_recurrent_rnn_tokenizer.py index 59be6b31ea7e..fe1d6c225f21 100644 --- a/rllib/examples/custom_recurrent_rnn_tokenizer.py +++ b/rllib/examples/custom_recurrent_rnn_tokenizer.py @@ -1,6 +1,6 @@ # @OldAPIStack -"""Example of define custom tokenizers for recurrent models in RLModules. +"""Example of defining custom tokenizers for recurrent models in RLModules. This example shows the following steps: - Define a custom tokenizer for a recurrent encoder. diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py index ba56f5b88909..42e6595feb61 100644 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -102,7 +102,7 @@ .env_runners(num_env_runners=args.num_env_runners) # Define Learner scaling. .learners( - # How many Learner workers do we need? If you have more than 1 GPU, you + # How many Learner workers do we need? If you have more than 1 GPU, # should set this to the number of GPUs available. num_learners=args.num_learners, # How many GPUs does each Learner need? If you have more than 1 GPU or only diff --git a/rllib/examples/offline_rl/custom_input_api.py b/rllib/examples/offline_rl/custom_input_api.py index bd192184155f..789e64a2a357 100644 --- a/rllib/examples/offline_rl/custom_input_api.py +++ b/rllib/examples/offline_rl/custom_input_api.py @@ -1,6 +1,6 @@ # @OldAPIStack -"""Example of creating a custom input api +"""Example of creating a custom input API Custom input apis are useful when your data source is in a custom format or when it is necessary to use an external data loading mechanism. diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index b05c682f8cd5..b60e80e8dff0 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -1,6 +1,6 @@ # @OldAPIStack -"""Example on how to use CQL to learn from an offline json file. +"""Example on how to use CQL to learn from an offline JSON file. Important node: Make sure that your offline data file contains only a single timestep per line to mimic the way SAC pulls samples from From 8966d52f2ea2a99cd0684d239ee844cd2cad6caf Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 10 Jun 2024 15:00:43 +0200 Subject: [PATCH 16/23] Apply suggestions from code review Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com> Signed-off-by: Sven Mika --- rllib/examples/gpus/fractional_gpus_per_learner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py index 42e6595feb61..5a33a24fa595 100644 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -103,7 +103,7 @@ # Define Learner scaling. .learners( # How many Learner workers do we need? If you have more than 1 GPU, - # should set this to the number of GPUs available. + # set this parameter to the number of GPUs available. num_learners=args.num_learners, # How many GPUs does each Learner need? 
If you have more than 1 GPU or only # one Learner, you should set this to 1, otherwise, set this to some From cbc7f5b43fd438ca7427524339133f2106c55305 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 10 Jun 2024 15:55:34 +0200 Subject: [PATCH 17/23] fix Signed-off-by: sven1977 --- doc/source/rllib/rllib-examples.rst | 73 ++++++++++++----------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 2c0ebbc279ab..5912706c57e1 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -67,11 +67,6 @@ Algorithms :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel (also using multi-agent API). -Catalogs --------- - - - Checkpoints ----------- @@ -122,11 +117,12 @@ Curriculum Learning - |new_stack| `How to set up curriculum learning with the custom callbacks API `__: Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) and thus help the learning algorithm to cope with an otherwise unsolvable task. - Also see the :doc:`curriculum learning how-to ` from the documentation. + Also see the :doc:`curriculum learning how-to ` from the documentation. Debugging --------- - +- |old_stack| `How to train an RLlib algorithm using a deterministic/reproducible setup `__: + Example showing how you can train an RLlib algo in a deterministic, reproducible fashion using seeding. Environments ------------ @@ -143,6 +139,20 @@ Environments Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. +- |old_stack| `Unity3D client/server `__: + Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting + clients against a central RLlib Policy server learning how to play the game. + The n distributed clients could themselves be servers for external/human players and allow for control + being fully in the hands of the Unity entities instead of RLlib. + Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. + +- |old_stack| `CartPole client/server `__: + Example of online serving of predictions for a simple CartPole policy. + +- |old_stack| `Saving experiences `__: + Example of how to externally generate experience batches in RLlib-compatible format. + + Evaluation ---------- @@ -170,8 +180,12 @@ Hierarchical Training Inference (of Models/Policies) ------------------------------ -- - +- |old_stack| `How to do inference with an already trained policy `__: + Example of how to perform inference (compute actions) on an already trained policy. +- |old_stack| `How to do inference with an already trained (LSTM) policy `__: + Example of how to perform inference (compute actions) on an already trained (LSTM) policy. +- |old_stack| `How to do inference with an already trained (attention) policy `__: + Example of how to perform inference (compute actions) on an already trained (attention) policy. Metrics ------- @@ -233,33 +247,13 @@ Ray Tune and RLlib RLModules --------- -- |new_stack| `How to Custom tune experiment `__: - How to run a custom Ray Tune experiment with RLlib with custom training- and evaluation phases. 
- - |new_stack| `Autoregressive action distribution example `__: Learning with an auto-regressive action distribution (for example, two action components, where distribution of the second component depends on the first's actually sampled value). - |old_stack| `Parametric actions `__: Example of how to handle variable-length or parametric action spaces. - - |old_stack| `Using the "Repeated" space of RLlib for variable lengths observations `__: How to use RLlib's `Repeated` space to handle variable length observations. - - -Tuned Examples -++++++++++++++ - -- `Tuned examples `__: - Collection of tuned hyperparameters sorted by algorithm. - - - -TODO: clean up from here on - - -Custom- and Complex Models --------------------------- - - |old_stack| `Custom Keras model `__: Example of using a custom Keras model. - |old_stack| `Registering a custom model with supervised loss `__: @@ -276,22 +270,15 @@ Custom- and Complex Models Example of DeepMind's Differentiable Neural Computer for partially observable environments. -Serving and Offline -------------------- -- `Unity3D client/server `__: - Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting - clients against a central RLlib Policy server learning how to play the game. - The n distributed clients could themselves be servers for external/human players and allow for control - being fully in the hands of the Unity entities instead of RLlib. - Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. -- `CartPole client/server `__: - Example of online serving of predictions for a simple CartPole policy. -- `Saving experiences `__: - Example of how to externally generate experience batches in RLlib-compatible format. +Tuned Examples +++++++++++++++ + +- `Tuned examples `__: + Collection of tuned hyperparameters sorted by algorithm. Community Examples ------------------- +++++++++++++++++++ - |old_stack| `Arena AI `__: A General Evaluation Platform and Building Toolkit for Single/Multi-Agent Intelligence with RLlib-generated baselines. @@ -328,7 +315,7 @@ Community Examples Blog Posts ----------- +++++++++++ - |old_stack| `Attention Nets and More with RLlib’s Trajectory View API `__: Blog describing RLlib's new "trajectory view API" and how it enables implementations of GTrXL (attention net) architectures. 
From ed21e410706b7ed7d48bf0d224a59633ebcf9d70 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 11:12:11 +0200 Subject: [PATCH 18/23] fix Signed-off-by: sven1977 --- rllib/BUILD | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 487503715fc0..7fc6899cdf71 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3051,11 +3051,11 @@ py_test( ) py_test( - name = "examples/custom_metrics_and_callbacks", - main = "examples/custom_metrics_and_callbacks.py", + name = "examples/metrics/custom_metrics_and_callbacks", + main = "examples/metrics/custom_metrics_and_callbacks.py", tags = ["team:rllib", "exclusive", "examples"], size = "small", - srcs = ["examples/custom_metrics_and_callbacks.py"], + srcs = ["examples/metrics/custom_metrics_and_callbacks.py"], args = ["--stop-iters=2"] ) From 3ea64bf49c40cfcbf33c59400f15d8e901de3f04 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 11:14:50 +0200 Subject: [PATCH 19/23] fix Signed-off-by: sven1977 --- doc/source/rllib/rllib-advanced-api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib/rllib-advanced-api.rst b/doc/source/rllib/rllib-advanced-api.rst index aa125f5768b0..d9c31eb5cf71 100644 --- a/doc/source/rllib/rllib-advanced-api.rst +++ b/doc/source/rllib/rllib-advanced-api.rst @@ -36,7 +36,7 @@ First, define some env options. This example uses the `FrozenLake-v1` environmen a grid world, whose map is fully customizable. Three tasks of different env difficulties are represented by slightly different maps that the agent has to navigate. -.. literalinclude:: /../../rllib/examples/curriculum/curriculum_learning.py +.. literalinclude:: ../../../rllib/examples/curriculum/curriculum_learning.py :language: python :start-after: __curriculum_learning_example_env_options__ :end-before: __END_curriculum_learning_example_env_options__ From f192bc35635b3d27601f098f7ce4bbe0f2dbf4cf Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 12:03:02 +0200 Subject: [PATCH 20/23] fix Signed-off-by: sven1977 --- doc/source/rllib/rllib-advanced-api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/rllib/rllib-advanced-api.rst b/doc/source/rllib/rllib-advanced-api.rst index d9c31eb5cf71..5b35253f1539 100644 --- a/doc/source/rllib/rllib-advanced-api.rst +++ b/doc/source/rllib/rllib-advanced-api.rst @@ -29,7 +29,7 @@ See the `Reverse Curriculum Generation for Reinforcement Learning Agents `__ introduces +curricula. This `example script `__ introduces the basic concepts you need to understand. First, define some env options. 
This example uses the `FrozenLake-v1` environment, From 2dbe142d990786820afb8ce041045196cd48d653 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 14:15:56 +0200 Subject: [PATCH 21/23] fix Signed-off-by: sven1977 --- .../rllib/images/sigils/new-api-stack.svg | 2 +- .../rllib/images/sigils/old-api-stack.svg | 2 +- doc/source/rllib/index.rst | 2 +- doc/source/rllib/key-concepts.rst | 2 +- doc/source/rllib/package_ref/evaluation.rst | 2 +- doc/source/rllib/rllib-algorithms.rst | 2 +- doc/source/rllib/rllib-env.rst | 2 +- doc/source/rllib/rllib-examples.rst | 85 +++++++++++-------- doc/source/rllib/rllib-learner.rst | 4 +- doc/source/rllib/rllib-models.rst | 2 +- doc/source/rllib/rllib-torch2x.rst | 2 +- doc/source/rllib/rllib-training.rst | 8 +- rllib/algorithms/algorithm_config.py | 28 +++--- rllib/benchmarks/torch_compile/README.md | 2 +- rllib/core/learner/learner_group.py | 2 +- rllib/utils/error.py | 6 +- rllib/utils/test_utils.py | 2 +- 17 files changed, 85 insertions(+), 70 deletions(-) diff --git a/doc/source/rllib/images/sigils/new-api-stack.svg b/doc/source/rllib/images/sigils/new-api-stack.svg index ec8c5a035279..bf4c59e0058d 100644 --- a/doc/source/rllib/images/sigils/new-api-stack.svg +++ b/doc/source/rllib/images/sigils/new-api-stack.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib/images/sigils/old-api-stack.svg b/doc/source/rllib/images/sigils/old-api-stack.svg index 7c57ef12c9bd..fb819d4a1d78 100644 --- a/doc/source/rllib/images/sigils/old-api-stack.svg +++ b/doc/source/rllib/images/sigils/old-api-stack.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 7ef5b21a41ca..15ad1c0ce4d3 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -167,7 +167,7 @@ Feature Overview **RLlib Algorithms** ^^^ - Check out the many available RL algorithms of RLlib for model-free and model-based + See the many available RL algorithms of RLlib for model-free and model-based RL, on-policy and off-policy training, multi-agent RL, and more. +++ .. button-ref:: rllib-algorithms-doc diff --git a/doc/source/rllib/key-concepts.rst b/doc/source/rllib/key-concepts.rst index c291cb2c76f3..470e66ff71a8 100644 --- a/doc/source/rllib/key-concepts.rst +++ b/doc/source/rllib/key-concepts.rst @@ -114,7 +114,7 @@ The following figure shows *synchronous sampling*, the simplest of `these patter RLlib uses `Ray actors `__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism `__ used for training by changing the ``num_env_runners`` parameter. -Check out our `scaling guide `__ for more details here. +See this `scaling guide `__ for more details here. RL Modules diff --git a/doc/source/rllib/package_ref/evaluation.rst b/doc/source/rllib/package_ref/evaluation.rst index 1b8755369171..5d3db4f9d1cf 100644 --- a/doc/source/rllib/package_ref/evaluation.rst +++ b/doc/source/rllib/package_ref/evaluation.rst @@ -23,7 +23,7 @@ which sit inside a :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup` **A typical RLlib EnvRunnerGroup setup inside an RLlib Algorithm:** Each :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup` contains exactly one local :py:class:`~ray.rllib.env.env_runner.EnvRunner` object and N ray remote - :py:class:`~ray.rllib.env.env_runner.EnvRunner` (ray actors). + :py:class:`~ray.rllib.env.env_runner.EnvRunner` (Ray actors). 
The workers contain a policy map (with one or more policies), and - in case a simulator (env) is available - a vectorized :py:class:`~ray.rllib.env.base_env.BaseEnv` (containing M sub-environments) and a :py:class:`~ray.rllib.evaluation.sampler.SamplerInput` (either synchronous or asynchronous) which controls diff --git a/doc/source/rllib/rllib-algorithms.rst b/doc/source/rllib/rllib-algorithms.rst index 0c0482336e59..0c867d2481e3 100644 --- a/doc/source/rllib/rllib-algorithms.rst +++ b/doc/source/rllib/rllib-algorithms.rst @@ -9,7 +9,7 @@ Algorithms .. tip:: - Check out the `environments `__ page to learn more about different environment types. + See the `environments `__ page to learn more about different environment types. Available Algorithms - Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib/rllib-env.rst b/doc/source/rllib/rllib-env.rst index a76af8ac64cb..b992bbb7e8b7 100644 --- a/doc/source/rllib/rllib-env.rst +++ b/doc/source/rllib/rllib-env.rst @@ -11,7 +11,7 @@ RLlib works with several different types of environments, including `Farama-Foun .. tip:: - Not all environments work with all algorithms. Check out the `algorithm overview `__ for more information. + Not all environments work with all algorithms. See the `algorithm overview `__ for more information. .. image:: images/rllib-envs.svg diff --git a/doc/source/rllib/rllib-examples.rst b/doc/source/rllib/rllib-examples.rst index 5912706c57e1..ff27b6e6c1d0 100644 --- a/doc/source/rllib/rllib-examples.rst +++ b/doc/source/rllib/rllib-examples.rst @@ -4,17 +4,18 @@ .. |new_stack| image:: /rllib/images/sigils/new-api-stack.svg :class: inline-figure - :width: 32 + :width: 64 .. |old_stack| image:: /rllib/images/sigils/old-api-stack.svg :class: inline-figure - :width: 32 + :width: 64 Examples ======== -This page contains an index of all the python scripts in the `examples/ folder ` +This page contains an index of all the python scripts in the +`examples folder `__ of RLlib, demonstrating the different use cases and features of the library. .. note:: @@ -28,12 +29,12 @@ of RLlib, demonstrating the different use cases and features of the library. .. note:: If any new-API-stack example is broken, or if you'd like to add an example to this page, - feel free to raise an issue on RLlib's `github repository `. + feel free to raise an issue on `RLlib's github repository `__. Folder Structure ++++++++++++++++ -The `examples/ folder ` is +The `examples folder `__ is structured into several sub-directories, the contents of all of which are described in detail below. @@ -41,11 +42,11 @@ How to run an example script ++++++++++++++++++++++++++++ Most of the example scripts are self-executable, meaning you can just ``cd`` into the respective -directory and type: +directory and run the script as-is with python: .. code-block:: bash - $ cd examples/multi_agent + $ cd ray/rllib/examples/multi_agent $ python multi_agent_pendulum.py --enable-new-api-stack --num-agents=2 @@ -62,7 +63,7 @@ All sub-folders Algorithms ---------- -- |new_stack| `Custom Algorith.training_step() method combining on- and off-policy learning `__: +- |new_stack| `How to write a custom Algorith.training_step() method combining on- and off-policy training `__: Example of how to override the :py:meth:`~ray.rllib.algorithms.algorithm.training_step` method of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class to train two different policies in parallel (also using multi-agent API). 
@@ -91,7 +92,7 @@ Connectors - |new_stack| `How to mean/std-filter observations `__: An example of a :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` that filters all observations from the environment using a - plain mean/std-filter (that is shift by mean and divide by std-dev). This example demonstrates + plain mean/std-filter (shift by the mean and divide by std-dev). This example demonstrates how a stateful :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` class has its states (here the means and standard deviations of the individual observation items) coming from the different :py:class:`~ray.rllib.env.env_runner.EnvRunner` instances a) merged into one common state and @@ -115,14 +116,14 @@ Curriculum Learning ------------------- - |new_stack| `How to set up curriculum learning with the custom callbacks API `__: - Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) - and thus help the learning algorithm to cope with an otherwise unsolvable task. - Also see the :doc:`curriculum learning how-to ` from the documentation. + Example of how to make the environment go through different levels of difficulty (from easy to harder to solve) + and thus help the learning algorithm to cope with an otherwise unsolvable task. + Also see the :doc:`curriculum learning how-to ` from the documentation. Debugging --------- - |old_stack| `How to train an RLlib algorithm using a deterministic/reproducible setup `__: - Example showing how you can train an RLlib algo in a deterministic, reproducible fashion using seeding. + Example showing how you can train an RLlib algorithm in a deterministic, reproducible fashion using seeding. Environments ------------ @@ -132,26 +133,23 @@ Environments - |new_stack| `How to set up rendering (and recording) of the environment trajectories during training with WandB `__: Example showing how you can render and record episode trajectories of your gymnasium envs and log the videos to WandB. -- |old_stack| `Local Unity3D multi-agent environment example `__: +- |old_stack| `How to run a Unity3D multi-agent environment locally `__: Example of how to setup an RLlib Algorithm against a locally running Unity3D editor instance to learn any Unity3D game (including support for multi-agent). Use this example to try things out and watch the game and the learning progress live in the editor. Providing a compiled game, this example could also run in distributed fashion with `num_env_runners > 0`. For a more heavy-weight, distributed, cloud-based example, see ``Unity3D client/server`` below. -- |old_stack| `Unity3D client/server `__: +- |old_stack| `How to run with a Unity3D client/server setup `__: Example of how to setup n distributed Unity3D (compiled) games in the cloud that function as data collecting clients against a central RLlib Policy server learning how to play the game. The n distributed clients could themselves be servers for external/human players and allow for control being fully in the hands of the Unity entities instead of RLlib. Note: Uses Unity's MLAgents SDK (>=1.0) and supports all provided MLAgents example games and multi-agent setups. -- |old_stack| `CartPole client/server `__: +- |old_stack| `How to run with a CartPole client/server setup `__: Example of online serving of predictions for a simple CartPole policy. -- |old_stack| `Saving experiences `__: - Example of how to externally generate experience batches in RLlib-compatible format. 
- Evaluation ---------- @@ -212,19 +210,22 @@ Multi-Agent RL - |new_stack| `How to train a single policy (weight sharing) controlling more than one agents `__: Example of how to define weight-sharing layers between two different policies. -- |old_stack| `PPO with centralized critic on two-step game `__: +- |old_stack| `Hwo to write and set up a model with centralized critic `__: Example of customizing PPO to leverage a centralized value function. -- |old_stack| `Centralized critic in the env `__: +- |old_stack| `How to write and set up a model with centralized critic in the env `__: A simpler method of implementing a centralized critic by augmenting agent observations with global information. -- |old_stack| `Multiple algorithms `__: +- |old_stack| `How to combine multiple algorithms into onw using the multi-agent API `__: Example of alternating training between DQN and PPO. Offline RL ---------- -- |old_stack| `Offline RL with CQL `__: +- |old_stack| `How to run an offline RL experiment with CQL `__: Example showing how to run an offline RL training job using a historic-data JSON file. +- |old_stack| `How to save experiences from an environment for offline RL `__: + Example of how to externally generate experience batches in RLlib-compatible format. + Ray Serve and RLlib ------------------- @@ -247,34 +248,48 @@ Ray Tune and RLlib RLModules --------- -- |new_stack| `Autoregressive action distribution example `__: +- |new_stack| `How to configure an autoregressive action distribution `__: Learning with an auto-regressive action distribution (for example, two action components, where distribution of the second component depends on the first's actually sampled value). -- |old_stack| `Parametric actions `__: +- |old_stack| `How to train with parametric actions `__: Example of how to handle variable-length or parametric action spaces. -- |old_stack| `Using the "Repeated" space of RLlib for variable lengths observations `__: +- |old_stack| `How to using the "Repeated" space of RLlib for variable lengths observations `__: How to use RLlib's `Repeated` space to handle variable length observations. -- |old_stack| `Custom Keras model `__: +- |old_stack| `How to write a custom Keras model `__: Example of using a custom Keras model. -- |old_stack| `Registering a custom model with supervised loss `__: +- |old_stack| `How to register a custom model with supervised loss `__: Example of defining and registering a custom model with a supervised loss. -- |old_stack| `Batch normalization `__: +- |old_stack| `How to train with batch normalization `__: Example of adding batch norm layers to a custom model. -- |old_stack| `Custom model API example `__: +- |old_stack| `How to write a custom model with its custom API `__: Shows how to define a custom Model API in RLlib, such that it can be used inside certain algorithms. -- |old_stack| `Trajectory View API utilizing model `__: +- |old_stack| `How to write a "trajectory ciew API" utilizing model `__: An example on how a model can use the trajectory view API to specify its own input. -- |old_stack| `MobileNetV2 wrapping example model `__: +- |old_stack| `How to wrap MobileNetV2 into your RLlib model `__: Implementations of `tf.keras.applications.mobilenet_v2.MobileNetV2` and `torch.hub (mobilenet_v2)`-wrapping example models. -- |old_stack| `Differentiable Neural Computer `__: +- |old_stack| `How to setup a Differentiable Neural Computer `__: Example of DeepMind's Differentiable Neural Computer for partially observable environments. 
Tuned Examples ++++++++++++++ -- `Tuned examples `__: - Collection of tuned hyperparameters sorted by algorithm. +The `tuned examples `__ folder +contains python config files (yaml for the old API stack) that can be executed analogously to +all other example scripts described here in order to run tuned learning experiments +for the different algorithms and different environment types. + +For example, see this tuned Atari example for PPO, which learns to solve the Pong environment +in roughly 5min. It can be run like this on a single g5.24xlarge (or g6.24xlarge) machine with +4 GPUs and 96 CPUs: + +.. code-block:: bash + + $ cd ray/rllib/tuned_examples/ppo + $ python atari_ppo.py --env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95 + +Note that some of the files in this folder are used for RLlib's daily or weekly +release tests as well. Community Examples diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 4fb3ed9d0415..8fa793a08eb6 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -132,9 +132,9 @@ and :py:class:`~ray.rllib.core.learner.learner.Learner` APIs via the :py:class:` # LearnerGroup. config = ( PPOConfig() - # Number of Learner workers (ray actors). + # Number of Learner workers (Ray actors). # Use 0 for no actors, only create a local Learner. - # Use >=1 to create n DDP-style Learner workers (ray actors). + # Use >=1 to create n DDP-style Learner workers (Ray actors). .learners(num_learners=1) # Specify the learner's hyperparameters. .training( diff --git a/doc/source/rllib/rllib-models.rst b/doc/source/rllib/rllib-models.rst index e14713a216ab..001036405abe 100644 --- a/doc/source/rllib/rllib-models.rst +++ b/doc/source/rllib/rllib-models.rst @@ -332,7 +332,7 @@ Implementing custom Attention Networks Similar to the RNN case described above, you could also implement your own attention-based networks, instead of using the ``use_attention: True`` flag in your model config. -Check out RLlib's `GTrXL (Attention Net) `__ implementations +See RLlib's `GTrXL (Attention Net) `__ implementations (for `TF `__ and `PyTorch `__) to get a better idea on how to write your own models of this type. These are the models we use as wrappers when ``use_attention=True``. diff --git a/doc/source/rllib/rllib-torch2x.rst b/doc/source/rllib/rllib-torch2x.rst index ec3e50bf934d..06c7476e77bd 100644 --- a/doc/source/rllib/rllib-torch2x.rst +++ b/doc/source/rllib/rllib-torch2x.rst @@ -80,7 +80,7 @@ Some meta-level comments Exploration ------------ -In RLlib, you can now set the configuration so that it uses the compiled module during sampling of an RL agent training process. By default, the rollout workers run on CPU, therefore it is recommended to use the ``ipex`` or ``onnxrt`` backend. However, you can still run the sampling part on GPUs as well by setting ``num_gpus_per_env_runner`` in which case other backends can be used as well. For enabling torch-compile during training you can also set `torch_compile_learner` equivalents. +In RLlib, you can now set the configuration so that it uses the compiled module during sampling of an RL agent training process. By default, the rollout workers run on CPU, therefore it's recommended to use the ``ipex`` or ``onnxrt`` backend. However, you can still run the sampling part on GPUs as well by setting ``num_gpus_per_env_runner`` in which case other backends can be used as well. For enabling torch-compile during training you can also set `torch_compile_learner` equivalents. 
diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 6bcf8672f276..fdb2682e4ab1 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -444,11 +444,11 @@ and 5 remote workers (responsible for sample collection). Since learning is most of the time done on the local worker, it may help to provide one or more GPUs to that worker via the ``num_gpus`` setting. -Similarly, the resource allocation to remote workers can be controlled via ``num_cpus_per_env_runner``, ``num_gpus_per_env_runner``, and ``custom_resources_per_env_runner``. +Similarly, you can control the resource allocation to remote workers with ``num_cpus_per_env_runner``, ``num_gpus_per_env_runner``, and ``custom_resources_per_env_runner``. -The number of GPUs can be fractional quantities (e.g. 0.5) to allocate only a fraction +The number of GPUs can be fractional quantities (for example, 0.5) to allocate only a fraction of a GPU. For example, with DQN you can pack five algorithms onto one GPU by setting -``num_gpus: 0.2``. Check out `this fractional GPU example here `__ +``num_gpus: 0.2``. See `this fractional GPU example here `__ as well that also demonstrates how environments (running on the remote workers) that require a GPU can benefit from the ``num_gpus_per_env_runner`` setting. @@ -493,7 +493,7 @@ Here are some rules of thumb for scaling training with RLlib. 2. If the environment is fast and the model is small (most models for RL are), use time-efficient algorithms such as :ref:`PPO `, or :ref:`IMPALA `. These can be scaled by increasing ``num_env_runners`` to add rollout workers. It may also make sense to enable `vectorization `__ for -inference. Make sure to set ``num_gpus: 1`` if you want to use a GPU. If the learner becomes a bottleneck, multiple GPUs can be used for learning by setting +inference. Make sure to set ``num_gpus: 1`` if you want to use a GPU. If the learner becomes a bottleneck, you can use multiple GPUs for learning by setting ``num_gpus > 1``. 3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_env_runner: 1``. If you only have a single GPU, consider ``num_env_runners: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 8d0b57c208fc..92c14477f600 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1965,26 +1965,26 @@ def learners( Args: num_learners: Number of Learner workers used for updating the RLModule. - A value of 0 means training will take place on a local Learner on main + A value of 0 means training takes place on a local Learner on main process CPUs or 1 GPU (determined by `num_gpus_per_learner`). For multi-gpu training, you have to set `num_learners` to > 1 and set - `num_gpus_per_learner` accordingly (e.g. 4 GPUs total and model fits on + `num_gpus_per_learner` accordingly (e.g., 4 GPUs total and model fits on 1 GPU: `num_learners=4; num_gpus_per_learner=1` OR 4 GPUs total and model requires 2 GPUs: `num_learners=2; num_gpus_per_learner=2`). num_cpus_per_learner: Number of CPUs allocated per Learner worker. Only necessary for custom processing pipeline inside each Learner requiring multiple CPU cores. Ignored if `num_learners=0`. 
num_gpus_per_learner: Number of GPUs allocated per Learner worker. If - `num_learners=0`, any value greater than 0 will run the - training on a single GPU on the main process, while a value of 0 will - run the training on main process CPUs. If `num_gpus_per_learner` is - > 0, then `num_cpus_per_learner` should not be changed (from its default + `num_learners=0`, any value greater than 0 runs the + training on a single GPU on the main process, while a value of 0 runs + the training on main process CPUs. If `num_gpus_per_learner` is > 0, + then you shouldn't change `num_cpus_per_learner` (from its default value of 1). local_gpu_idx: If `num_gpus_per_learner` > 0, and - `num_learners` < 2, then this GPU index will be used for - training. This is an index into the available + `num_learners` < 2, then RLlib uses this GPU index for training. This is + an index into the available CUDA devices. For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` - then a `local_gpu_idx` of 0 will use the GPU with ID=1 on the node. + and `local_gpu_idx=0`, RLlib uses the GPU with ID=1 on the node. Returns: This updated AlgorithmConfig object. @@ -2060,8 +2060,8 @@ def training( worker. This setting only applies to the new API stack. The number of Learner workers can be set via `config.resources( num_learners=...)`. The total effective batch size is then - `num_learners` x `train_batch_size_per_learner` and can - be accessed via the property `AlgorithmConfig.total_train_batch_size`. + `num_learners` x `train_batch_size_per_learner` and you can + access it with the property `AlgorithmConfig.total_train_batch_size`. train_batch_size: Training batch size, if applicable. When on the new API stack, this setting should no longer be used. Instead, use `train_batch_size_per_learner` (in combination with @@ -4015,7 +4015,7 @@ def _validate_resources_settings(self): # Remove this once we are able to specify placement group bundle index in RLlib if self.num_cpus_per_learner > 1 and self.num_gpus_per_learner > 0: raise ValueError( - "Cannot set both `num_cpus_per_learner` > 1 and " + "Can't set both `num_cpus_per_learner` > 1 and " " `num_gpus_per_learner` > 0! Either set " "`num_cpus_per_learner` > 1 (and `num_gpus_per_learner`" "=0) OR set `num_gpus_per_learner` > 0 (and leave " @@ -4028,7 +4028,7 @@ def _validate_resources_settings(self): if self.num_learners == 0 and self.num_gpus_per_env_runner > 1: raise ValueError( "num_gpus_per_env_runner must be 0 (cpu) or 1 (gpu) when using local " - "mode (i.e. `num_learners=0`)" + "mode (i.e., `num_learners=0`)" ) def _validate_multi_agent_settings(self): @@ -4133,7 +4133,7 @@ def _validate_input_settings(self): ) if self.input_ == "dataset": - # If we need to read a ray dataset set the parallelism and + # If you need to read a Ray dataset set the parallelism and # num_cpus_per_read_task from rollout worker settings self.input_config["num_cpus_per_read_task"] = self.num_cpus_per_env_runner if self.in_evaluation: diff --git a/rllib/benchmarks/torch_compile/README.md b/rllib/benchmarks/torch_compile/README.md index f0216790ec7c..aca3935a12a1 100644 --- a/rllib/benchmarks/torch_compile/README.md +++ b/rllib/benchmarks/torch_compile/README.md @@ -54,7 +54,7 @@ config.framework( ) ``` -This benchmark script runs PPO algorithm with the default model architecture for Atari-Breakout game. It will run the training for `n` iterations for both compiled and non-compiled RLModules and reports the speedup. Note that negative speedup values mean a slowdown when you compile the module. 
+This benchmark script runs the PPO algorithm with the default model architecture for the Atari Breakout game. It runs the training for `n` iterations for both compiled and non-compiled RLModules and reports the speedup. Note that negative speedup values mean a slowdown when you compile the module.

To run the benchmark script, you need a ray cluster comprised of at least 129 CPUs (2x64 + 1) and 2 GPUs. If this is not accessible to you, you can change the number of sampling workers and batch size to make the requirements smaller.
diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py
index 5496cb0ce5ec..2b066f981943 100644
--- a/rllib/core/learner/learner_group.py
+++ b/rllib/core/learner/learner_group.py
@@ -140,7 +140,7 @@ def __init__(
        else:
            backend_config = _get_backend_config(learner_class)

-        # TODO (sven): Cannot set both `num_cpus_per_learner`>1 and
+        # TODO (sven): Can't set both `num_cpus_per_learner`>1 and
        #  `num_gpus_per_learner`>0! Users must set one or the other due
        #  to issues with placement group fragmentation. See
        #  https://github.com/ray-project/ray/issues/35409 for more details.
diff --git a/rllib/utils/error.py b/rllib/utils/error.py
index 3e2b7a8f9474..b57681465f94 100644
--- a/rllib/utils/error.py
+++ b/rllib/utils/error.py
@@ -37,11 +37,11 @@ class NotSerializable(Exception):
# num_gpus=n or num_gpus_per_env_runner=m settings.
ERR_MSG_NO_GPUS = """Found {} GPUs on your machine (GPU devices found: {})! If your
    machine does not have any GPUs, you should set the config keys
-    `num_gpus_per_learner` and `num_gpus_per_env_runner` to 0 (they may be set to
-    1 by default for your particular RL algorithm)."""
+    `num_gpus_per_learner` and `num_gpus_per_env_runner` to 0. They may be set to
+    1 by default for your particular RL algorithm."""

ERR_MSG_INVALID_ENV_DESCRIPTOR = """The env string you provided ('{}') is:
-a) Not a supported/installed environment.
+a) Not a supported or installed environment.
b) Not a tune-registered environment creator.
c) Not a valid env class string.
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index 7b16332206f3..eaed1c3d0590 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -1408,7 +1408,7 @@ def run_rllib_example_script_experiment(
        trainable: The Trainable sub-class to run in the tune.Tuner. If None (default),
            use the registered RLlib Algorithm class specified by args.algo.
        tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner.
-            In case `args.wandb_key` is provided, will append a WandB logger to this
+            In case `args.wandb_key` is provided, appends a WandB logger to this
            list.
        keep_config: Set this to True, if you don't want this utility to change the
            given `base_config` in any way and leave it as-is.
This is helpful From 5172242c7c2ab5168c28a0b16da8eb3b25e95129 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 15:24:13 +0200 Subject: [PATCH 22/23] fix Signed-off-by: sven1977 --- rllib/examples/gpus/fractional_gpus_per_learner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/examples/gpus/fractional_gpus_per_learner.py b/rllib/examples/gpus/fractional_gpus_per_learner.py index 5a33a24fa595..b577f66d5d09 100644 --- a/rllib/examples/gpus/fractional_gpus_per_learner.py +++ b/rllib/examples/gpus/fractional_gpus_per_learner.py @@ -77,6 +77,7 @@ parser = add_rllib_example_script_args( default_iters=50, default_reward=180, default_timesteps=100000 ) +parser.set_defaults(num_env_runners=2) # TODO (sven): Retire the currently supported --num-gpus in favor of --num-learners. parser.add_argument("--num-learners", type=int, default=1) parser.add_argument("--num-gpus-per-learner", type=float, default=0.5) From 41102e57422771f8d7fd25b9629139c09589289a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 13:17:51 +0200 Subject: [PATCH 23/23] fix Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 3 ++- rllib/algorithms/algorithm_config.py | 3 ++- rllib/examples/envs/external_envs/cartpole_server.py | 2 +- rllib/examples/metrics/__init__.py | 0 4 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 rllib/examples/metrics/__init__.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 479502c764ab..876a47b121f4 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2747,7 +2747,8 @@ def merge_algorithm_configs( deprecation_warning( "callbacks dict interface", "a class extending rllib.algorithms.callbacks.DefaultCallbacks; " - "see `rllib/examples/custom_metrics_and_callbacks.py` for an example.", + "see `rllib/examples/metrics/custom_metrics_and_callbacks.py` for an " + "example.", error=True, ) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 92c14477f600..5529ebed8be8 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2164,7 +2164,8 @@ def callbacks(self, callbacks_class) -> "AlgorithmConfig": callbacks_class: Callbacks class, whose methods will be run during various phases of training and environment sample collection. See the `DefaultCallbacks` class and - `examples/custom_metrics_and_callbacks.py` for more usage information. + `examples/metrics/custom_metrics_and_callbacks.py` for more usage + information. Returns: This updated AlgorithmConfig object. diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py index 5524602c8fe4..bb17089f43d2 100755 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ b/rllib/examples/envs/external_envs/cartpole_server.py @@ -33,7 +33,7 @@ from ray import air, tune from ray.air.constants import TRAINING_ITERATION from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.examples.custom_metrics_and_callbacks import MyCallbacks +from ray.rllib.examples.metrics.custom_metrics_and_callbacks import MyCallbacks from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, diff --git a/rllib/examples/metrics/__init__.py b/rllib/examples/metrics/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1
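For reference, a minimal sketch of the callbacks usage that the `callbacks_class` docstring
above points to (`examples/metrics/custom_metrics_and_callbacks.py`). This follows old API
stack conventions; `MyCustomMetricCallbacks` is a hypothetical name, and the episode and
metrics APIs differ on the new API stack:

.. code-block:: python

    from ray.rllib.algorithms.callbacks import DefaultCallbacks
    from ray.rllib.algorithms.ppo import PPOConfig


    class MyCustomMetricCallbacks(DefaultCallbacks):
        """Records each finished episode's length as a custom metric."""

        def on_episode_end(self, *, episode, **kwargs):
            # Old API stack: values in `custom_metrics` are aggregated
            # (mean/min/max) into the training results automatically.
            episode.custom_metrics["episode_len"] = episode.length


    config = (
        PPOConfig()
        .environment("CartPole-v1")
        .callbacks(MyCustomMetricCallbacks)
    )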