From c1e7c45f58c659132c56815b179e9493663fe73c Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 17 Sep 2024 17:51:00 +0200 Subject: [PATCH] [RLlib] Enhancements for multi-node/multi-GPU training and better EnvRunner error msg. (#47705) --- rllib/env/multi_agent_env_runner.py | 10 ++++++++-- rllib/env/single_agent_env_runner.py | 10 ++++++++-- rllib/utils/test_utils.py | 14 +++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index b6cf7eea5562..647073a96ca3 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -828,16 +828,22 @@ def make_env(self): remote=self.config.remote_worker_envs, ) + # No env provided -> Error. + if not self.config.env: + raise ValueError( + "`config.env` is not provided! You should provide a valid environment " + "to your config through `config.environment([env descriptor e.g. " + "'CartPole-v1'])`." + ) # Register env for the local context. # Note, `gym.register` has to be called on each worker. - if isinstance(self.config.env, str) and _global_registry.contains( + elif isinstance(self.config.env, str) and _global_registry.contains( ENV_CREATOR, self.config.env ): entry_point = partial( _global_registry.get(ENV_CREATOR, self.config.env), env_ctx, ) - else: entry_point = partial( _gym_env_creator, diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 73c8c417f7a1..857e7ab6fc58 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -794,16 +794,22 @@ def make_env(self) -> None: remote=self.config.remote_worker_envs, ) + # No env provided -> Error. + if not self.config.env: + raise ValueError( + "`config.env` is not provided! You should provide a valid environment " + "to your config through `config.environment([env descriptor e.g. " + "'CartPole-v1'])`." + ) # Register env for the local context. # Note, `gym.register` has to be called on each worker. - if isinstance(self.config.env, str) and _global_registry.contains( + elif isinstance(self.config.env, str) and _global_registry.contains( ENV_CREATOR, self.config.env ): entry_point = partial( _global_registry.get(ENV_CREATOR, self.config.env), env_ctx, ) - else: entry_point = partial( _gym_env_creator, diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 9ba8fc479f9a..9324d6b81e6f 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1392,15 +1392,19 @@ def run_rllib_example_script_experiment( # Define compute resources used automatically (only using the --num-gpus arg). # New stack. if config.enable_rl_module_and_learner: + # Do we have GPUs available in the cluster? + num_gpus = ray.cluster_resources().get("GPU", 0) + if args.num_gpus > 0 and num_gpus < args.num_gpus: + logger.warning( + f"You are running your script with --num-gpus={args.num_gpus}, " + f"but your cluster only has {num_gpus} GPUs! Will run " + f"with {num_gpus} CPU Learners instead." + ) # Define compute resources used. config.resources(num_gpus=0) config.learners( num_learners=args.num_gpus, - num_gpus_per_learner=( - 1 - if torch and torch.cuda.is_available() and args.num_gpus > 0 - else 0 - ), + num_gpus_per_learner=1 if num_gpus >= args.num_gpus > 0 else 0, ) config.resources(num_gpus=0) # Old stack.