From 9aed2355cd369c7eeb048e873e78321097157a6b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 21 Apr 2024 17:10:41 +0200 Subject: [PATCH 01/15] wip Signed-off-by: sven1977 --- rllib/BUILD | 14 - rllib/algorithms/algorithm.py | 1 - rllib/algorithms/algorithm_config.py | 58 +- rllib/algorithms/sac/tests/test_sac.py | 1 - .../algorithms/tests/test_worker_failures.py | 1 - rllib/env/multi_agent_env.py | 3 +- rllib/env/multi_agent_env_runner.py | 4 +- rllib/env/utils/__init__.py | 46 -- rllib/env/wrappers/group_agents_wrapper.py | 3 - rllib/env/wrappers/open_spiel.py | 1 - rllib/env/wrappers/unity3d_env.py | 4 - rllib/evaluation/rollout_worker.py | 21 +- rllib/evaluation/tests/test_env_runner_v2.py | 2 +- .../evaluation/tests/test_envs_that_crash.py | 6 - rllib/evaluation/tests/test_episode.py | 1 - rllib/evaluation/tests/test_episode_v2.py | 1 - rllib/evaluation/tests/test_rollout_worker.py | 10 +- .../examples/envs/classes/action_mask_env.py | 1 - .../envs/classes/cartpole_crashing.py | 3 - .../envs/classes/debug_counter_env.py | 1 - .../examples/envs/classes/pettingzoo_chess.py | 3 - .../envs/classes/pettingzoo_connect4.py | 3 - rllib/examples/envs/classes/two_step_game.py | 2 - rllib/examples/envs/classes/windy_maze_env.py | 1 - rllib/offline/estimators/tests/utils.py | 1 - .../backward_compat/test_gym_env_apis.py | 300 ---------- rllib/tests/test_nested_observation_spaces.py | 4 +- ...hing-and-stalling-recreate-workers-appo.py | 3 - ...cartpole-crashing-recreate-workers-appo.py | 3 - ...hing-and-stalling-recreate-workers-appo.py | 3 - ...cartpole-crashing-recreate-workers-appo.py | 3 - rllib/utils/gym.py | 87 --- rllib/utils/pre_checks/env.py | 531 +----------------- rllib/utils/serialization.py | 21 +- rllib/utils/tests/test_check_env.py | 368 ------------ 35 files changed, 34 insertions(+), 1481 deletions(-) delete mode 100644 rllib/tests/backward_compat/test_gym_env_apis.py delete mode 100644 rllib/utils/gym.py delete mode 100644 rllib/utils/tests/test_check_env.py diff --git a/rllib/BUILD b/rllib/BUILD index 7e8ac50ee373..dfbe4575de89 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1589,13 +1589,6 @@ py_test( srcs = ["utils/replay_buffers/tests/test_segment_tree_replay_buffer_api.py"] ) -py_test( - name = "test_check_env", - tags = ["team:rllib", "utils"], - size = "small", - srcs = ["utils/tests/test_check_env.py"] -) - py_test( name = "test_check_multi_agent", tags = ["team:rllib", "utils"], @@ -1627,13 +1620,6 @@ py_test( data = glob(["tests/backward_compat/checkpoints/**"]), ) -py_test( - name = "tests/backward_compat/test_gym_env_apis", - tags = ["team:rllib", "env"], - size = "large", - srcs = ["tests/backward_compat/test_gym_env_apis.py"] -) - py_test( name = "tests/test_algorithm_imports", tags = ["team:rllib", "tests_dir"], diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 0ebb729cca63..0ed94e8ba373 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2560,7 +2560,6 @@ def _is_multi_agent(self): return env_id, functools.partial( _gym_env_creator, env_descriptor=env_specifier, - auto_wrap_old_gym_envs=config.get("auto_wrap_old_gym_envs", True), ) # All other env classes: Call c'tor directly. 
else: diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 15ea5979adf3..af2bd3a23c2e 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -17,6 +17,7 @@ Union, ) +import gymnasium as gym from packaging import version import ray @@ -47,10 +48,6 @@ ) from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.from_config import NotProvided, from_config -from ray.rllib.utils.gym import ( - convert_old_gym_space_to_gymnasium_space, - try_import_gymnasium_and_gym, -) from ray.rllib.utils.policy import validate_policy_id from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.serialization import ( @@ -78,7 +75,6 @@ from ray.tune.result import TRIAL_INFO from ray.tune.tune import _Config -gym, old_gym = try_import_gymnasium_and_gym() Space = gym.Space """TODO(jungong, sven): in "offline_data" we can potentially unify all input types @@ -326,8 +322,6 @@ def __init__(self, algo_class=None): self.clip_rewards = None self.normalize_actions = True self.clip_actions = False - self.disable_env_checking = False - self.auto_wrap_old_gym_envs = True self.action_mask_key = "action_mask" # Whether this env is an atari env (for atari-specific preprocessing). # If not specified, we will try to auto-detect this. @@ -545,6 +539,8 @@ def __init__(self, algo_class=None): self.enable_async_evaluation = DEPRECATED_VALUE self.custom_async_evaluation_function = DEPRECATED_VALUE self._enable_rl_module_api = DEPRECATED_VALUE + self.auto_wrap_old_gym_envs = DEPRECATED_VALUE + self.disable_env_checking = DEPRECATED_VALUE # The following values have moved because of the new ReplayBuffer API self.buffer_size = DEPRECATED_VALUE @@ -1413,10 +1409,11 @@ def environment( clip_rewards: Optional[Union[bool, float]] = NotProvided, normalize_actions: Optional[bool] = NotProvided, clip_actions: Optional[bool] = NotProvided, - disable_env_checking: Optional[bool] = NotProvided, is_atari: Optional[bool] = NotProvided, - auto_wrap_old_gym_envs: Optional[bool] = NotProvided, action_mask_key: Optional[str] = NotProvided, + # Deprecated args. + auto_wrap_old_gym_envs=DEPRECATED_VALUE, + disable_env_checking=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the config's RL-environment settings. @@ -1456,24 +1453,27 @@ def environment( clip_actions: If True, the RLlib default ModuleToEnv connector will clip actions according to the env's bounds (before sending them into the `env.step()` call). - disable_env_checking: If True, disable the environment pre-checking module. is_atari: This config can be used to explicitly specify whether the env is an Atari env or not. If not specified, RLlib will try to auto-detect this. - auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using - the pre 0.24 gym APIs, e.g. reset() returning single obs and no info - dict). If True, RLlib will automatically wrap the given gym env class - with the gym-provided compatibility wrapper - (gym.wrappers.EnvCompatibility). If False, RLlib will produce a - descriptive error on which steps to perform to upgrade to gymnasium - (or to switch this flag to True). - action_mask_key: If observation is a dictionary, expect the value by + action_mask_key: If observation is a dictionary, expect the value by the key `action_mask_key` to contain a valid actions mask (`numpy.int8` array of zeros and ones). Defaults to "action_mask". Returns: This updated AlgorithmConfig object. 
""" + if auto_wrap_old_gym_envs != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.environment(auto_wrap_old_gym_envs=..)", + error=True, + ) + if disable_env_checking != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.environment(disable_env_checking=..)", + error=True, + ) + if env is not NotProvided: self.env = env if env_config is not NotProvided: @@ -1492,12 +1492,8 @@ def environment( self.normalize_actions = normalize_actions if clip_actions is not NotProvided: self.clip_actions = clip_actions - if disable_env_checking is not NotProvided: - self.disable_env_checking = disable_env_checking if is_atari is not NotProvided: self._is_atari = is_atari - if auto_wrap_old_gym_envs is not NotProvided: - self.auto_wrap_old_gym_envs = auto_wrap_old_gym_envs if action_mask_key is not NotProvided: self.action_mask_key = action_mask_key @@ -3187,16 +3183,8 @@ def get_multi_agent_setup( if policy_spec.policy_class is None and default_policy_class is not None: policies[pid].policy_class = default_policy_class - # In case - somehow - an old gym Space made it to here, convert it - # to the corresponding gymnasium space. - if old_gym and isinstance(policy_spec.observation_space, old_gym.Space): - policies[ - pid - ].observation_space = convert_old_gym_space_to_gymnasium_space( - policy_spec.observation_space - ) # Infer observation space. - elif policy_spec.observation_space is None: + if policy_spec.observation_space is None: if spaces is not None and pid in spaces: obs_space = spaces[pid][0] elif env_obs_space is not None: @@ -3251,14 +3239,8 @@ def get_multi_agent_setup( policies[pid].observation_space = obs_space - # In case - somehow - an old gym Space made it to here, convert it - # to the corresponding gymnasium space. - if old_gym and isinstance(policy_spec.action_space, old_gym.Space): - policies[pid].action_space = convert_old_gym_space_to_gymnasium_space( - policy_spec.action_space - ) # Infer action space. - elif policy_spec.action_space is None: + if policy_spec.action_space is None: if spaces is not None and pid in spaces: act_space = spaces[pid][1] elif env_act_space is not None: diff --git a/rllib/algorithms/sac/tests/test_sac.py b/rllib/algorithms/sac/tests/test_sac.py index e6d7c98c35df..dc3491a1e95f 100644 --- a/rllib/algorithms/sac/tests/test_sac.py +++ b/rllib/algorithms/sac/tests/test_sac.py @@ -33,7 +33,6 @@ class SimpleEnv(gym.Env): def __init__(self, config): - self._skip_env_checking = True if config.get("simplex_actions", False): self.action_space = Simplex((2,)) else: diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 6589aaa46791..dc0f46569ae5 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -74,7 +74,6 @@ class FaultInjectEnv(gym.Env): def __init__(self, config): # Use RandomEnv to control episode length if needed. self.env = RandomEnv(config) - self._skip_env_checking = True self.action_space = self.env.action_space self.observation_space = self.env.observation_space self.config = config diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 430c8aca5562..19554ff24444 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -40,8 +40,7 @@ class MultiAgentEnv(gym.Env): """ def __init__(self): - # TODO (sven): super init call seems to have been missing. Since forever. 
- # super().__init__() + super().__init__() if not hasattr(self, "observation_space"): self.observation_space = None diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index dd0eedc145ec..120ef1dc6780 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -16,6 +16,7 @@ from ray.rllib.env.utils import _gym_env_creator from ray.rllib.evaluation.metrics import RolloutMetrics from ray.rllib.utils.annotations import override +from ray.rllib.utils.pre_checks.env import check_multiagent_environments from ray.rllib.utils.typing import EpisodeID, ModelWeights from ray.util.annotations import PublicAPI from ray.tune.registry import ENV_CREATOR, _global_registry @@ -719,7 +720,8 @@ def make_env(self): ) # Perform actual gym.make call. - self.env = gym.make("rllib-multi-agent-env-v0") + self.env: MultiAgentEnv = gym.make("rllib-multi-agent-env-v0") + check_multiagent_environments(self.env) self.num_envs = 1 # Create the MultiAgentEnv (is-a gymnasium env). diff --git a/rllib/env/utils/__init__.py b/rllib/env/utils/__init__.py index 3651c2a7806b..67dc49efd76b 100644 --- a/rllib/env/utils/__init__.py +++ b/rllib/env/utils/__init__.py @@ -4,17 +4,10 @@ import gymnasium as gym from ray.rllib.env.env_context import EnvContext -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.rllib.env.wrappers.multi_agent_env_compatibility import ( - MultiAgentEnvCompatibility, -) from ray.rllib.utils.error import ( ERR_MSG_INVALID_ENV_DESCRIPTOR, - ERR_MSG_OLD_GYM_API, EnvError, ) -from ray.rllib.utils.gym import check_old_gym_env -from ray.util import log_once from ray.util.annotations import PublicAPI @@ -78,7 +71,6 @@ def try_import_open_spiel(error: bool = False): def _gym_env_creator( env_context: EnvContext, env_descriptor: Union[str, Type[gym.Env]], - auto_wrap_old_gym_envs: bool = True, ) -> gym.Env: """Tries to create a gym env given an EnvContext object and descriptor. @@ -95,12 +87,6 @@ def _gym_env_creator( env_descriptor: The env descriptor as a gym-registered string, e.g. CartPole-v1, ALE/MsPacman-v5, or CartPoleContinuousBulletEnv-v0. Alternatively, the gym.Env subclass to use. - auto_wrap_old_gym_envs: Whether to auto-wrap old gym environments (using - the pre 0.24 gym APIs, e.g. reset() returning single obs and no info - dict). If True, RLlib will automatically wrap the given gym env class - with the gym-provided compatibility wrapper (gym.wrappers.EnvCompatibility). - If False, RLlib will produce a descriptive error on which steps to perform - to upgrade to gymnasium (or to switch this flag to True). Returns: The actual gym environment object. @@ -125,38 +111,6 @@ def _gym_env_creator( env = env_descriptor(env_context) else: env = gym.make(env_descriptor, **env_context) - # If we are dealing with an old gym-env API, use the provided compatibility - # wrapper. - if auto_wrap_old_gym_envs: - try: - # Call the env's reset() method to check for the env using the old - # gym (reset doesn't take `seed` and `options` args and returns only - # the initial observations) or new gymnasium APIs (reset takes `seed` - # and `options` AND returns observations and infos). - obs_and_infos = env.reset(seed=None, options={}) - # Check return values for correct gymnasium. - check_old_gym_env(reset_results=obs_and_infos) - # TypeError for `reset()` not accepting seed/options. - # ValueError for `check_old_gym_env` raising error if return values - # incorrect. 
- except Exception: - if log_once("auto_wrap_gym_api"): - logger.warning( - "`config.auto_wrap_old_gym_envs` is activated AND you seem to " - "have provided an old gym-API environment. RLlib will therefore" - " try to auto-fix the following error. However, please " - "consider switching over to the new `gymnasium` APIs:\n" - + ERR_MSG_OLD_GYM_API - ) - # Multi-agent case. - if isinstance(env, MultiAgentEnv): - env = MultiAgentEnvCompatibility(env) - # Single agent (gymnasium.Env) case. - else: - env = gym.wrappers.EnvCompatibility(env) - # Repeat the checks, now everything should work. - obs_and_infos = env.reset(seed=None, options={}) - check_old_gym_env(reset_results=obs_and_infos) except gym.error.Error: raise EnvError(ERR_MSG_INVALID_ENV_DESCRIPTOR.format(env_descriptor)) diff --git a/rllib/env/wrappers/group_agents_wrapper.py b/rllib/env/wrappers/group_agents_wrapper.py index 42bf90b54ae9..cb545070412b 100644 --- a/rllib/env/wrappers/group_agents_wrapper.py +++ b/rllib/env/wrappers/group_agents_wrapper.py @@ -58,9 +58,6 @@ def __init__( """ super().__init__() self.env = env - # Inherit wrapped env's `_skip_env_checking` flag. - if hasattr(self.env, "_skip_env_checking"): - self._skip_env_checking = self.env._skip_env_checking self.groups = groups self.agent_id_to_group = {} for group_id, agent_ids in groups.items(): diff --git a/rllib/env/wrappers/open_spiel.py b/rllib/env/wrappers/open_spiel.py index 9b3b62c405f1..f18dc675bf24 100644 --- a/rllib/env/wrappers/open_spiel.py +++ b/rllib/env/wrappers/open_spiel.py @@ -13,7 +13,6 @@ class OpenSpielEnv(MultiAgentEnv): def __init__(self, env): super().__init__() self.env = env - self._skip_env_checking = True # Agent IDs are ints, starting from 0. self.num_agents = self.env.num_players() # Store the open-spiel game type. diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index c2edf5608710..88d28369c6c6 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -64,10 +64,6 @@ def __init__( Note: The game itself may contain its own episode length limits, which are always obeyed (on top of this value here). """ - # Skip env checking as the nature of the agent IDs depends on the game - # running in the connected Unity editor. - self._skip_env_checking = True - super().__init__() if file_name is None: diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 8351f882e56e..b234e0653906 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -65,7 +65,7 @@ ) from ray.rllib.policy.torch_policy import TorchPolicy from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils import check_env, force_list +from ray.rllib.utils import force_list from ray.rllib.utils.annotations import OldAPIStack, override from ray.rllib.utils.debug import summarize, update_global_seed_if_necessary from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning @@ -409,9 +409,6 @@ def gen_rollouts(): clip_rewards = self.config.clip_rewards if self.env is not None: - # Validate environment (general validation function). - if not self.config.disable_env_checking: - check_env(self.env, self.config) # Custom validation function given, typically a function attribute of the # Algorithm. 
if validate_env is not None: @@ -2010,8 +2007,6 @@ def _get_output_creator_from_config(self): def _get_make_sub_env_fn( self, env_creator, env_context, validate_env, env_wrapper, seed ): - config = self.config - def _make_sub_env_local(vector_index): # Used to created additional environments during environment # vectorization. @@ -2021,20 +2016,6 @@ def _make_sub_env_local(vector_index): env_ctx = env_context.copy_with_overrides(vector_index=vector_index) # Create the sub-env. env = env_creator(env_ctx) - # Validate first. - if not config.disable_env_checking: - try: - check_env(env, config) - except Exception as e: - logger.warning( - "We've added a module for checking environments that " - "are used in experiments. Your env may not be set up" - "correctly. You can disable env checking for now by setting " - "`disable_env_checking` to True in your experiment config " - "dictionary. You can run the environment checking module " - "standalone by calling ray.rllib.utils.check_env(env)." - ) - raise e # Custom validation function given by user. if validate_env is not None: validate_env(env, env_ctx) diff --git a/rllib/evaluation/tests/test_env_runner_v2.py b/rllib/evaluation/tests/test_env_runner_v2.py index 8fa8564f6985..473bee13ba3d 100644 --- a/rllib/evaluation/tests/test_env_runner_v2.py +++ b/rllib/evaluation/tests/test_env_runner_v2.py @@ -149,7 +149,7 @@ def compute_actions( config = ( PPOConfig() .framework("torch") - .environment("env_under_test", disable_env_checking=True) + .environment("env_under_test") .rollouts( num_envs_per_worker=1, num_rollout_workers=0, diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index 85e69a501c8a..d8f7d4c2d195 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -60,7 +60,6 @@ def test_env_crash_during_sampling(self): "p_crash": 0.2, "init_time_s": 0.3, }, - disable_env_checking=True, ) ) @@ -99,7 +98,6 @@ def test_env_crash_on_one_worker_during_sampling_but_ignore(self): # Only crash on worker with index 1. "crash_on_worker_indices": [1], }, - disable_env_checking=True, ) ) @@ -140,8 +138,6 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): # Only crash on worker with index 2. "crash_on_worker_indices": [2], }, - # Make sure nothing happens during pre-checks. - disable_env_checking=True, ) .fault_tolerance(delay_between_worker_restarts_s=0) ) @@ -185,8 +181,6 @@ def test_env_crash_during_sampling_but_restart_only_crashed_sub_env(self): # Crash prob=1%. "p_crash": 0.01, }, - # Make sure nothing happens during pre-checks. 
- disable_env_checking=True, ) ) for multi_agent in [True]: # TODO, False]: diff --git a/rllib/evaluation/tests/test_episode.py b/rllib/evaluation/tests/test_episode.py index 10d42068abca..814e3598b9ae 100644 --- a/rllib/evaluation/tests/test_episode.py +++ b/rllib/evaluation/tests/test_episode.py @@ -94,7 +94,6 @@ def compute_actions( class EpisodeEnv(MultiAgentEnv): def __init__(self, episode_length, num): super().__init__() - self._skip_env_checking = True self.agents = [MockEnv3(episode_length) for _ in range(num)] self.terminateds = set() self.truncateds = set() diff --git a/rllib/evaluation/tests/test_episode_v2.py b/rllib/evaluation/tests/test_episode_v2.py index fcbc9904a097..c4d02adfa9cc 100644 --- a/rllib/evaluation/tests/test_episode_v2.py +++ b/rllib/evaluation/tests/test_episode_v2.py @@ -31,7 +31,6 @@ def compute_actions( class EpisodeEnv(MultiAgentEnv): def __init__(self, episode_length, num): super().__init__() - self._skip_env_checking = True self.agents = [MockEnv3(episode_length) for _ in range(num)] self.terminateds = set() self.truncateds = set() diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 48bc1e7f1a91..02a4b7b167ce 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -449,16 +449,8 @@ def init(self, config): super().__init__(config=config) def step(self, action): - # Ensure that it is called from inside the sampling process. - import inspect - - curframe = inspect.currentframe() - called_from_check = any( - frame[3] == "check_gym_environments" - for frame in inspect.getouterframes(curframe, 2) - ) # Check, whether the action is immutable. - if action.flags.writeable and not called_from_check: + if action.flags.writeable: self.test_case.assertFalse( action.flags.writeable, "Action is mutable" ) diff --git a/rllib/examples/envs/classes/action_mask_env.py b/rllib/examples/envs/classes/action_mask_env.py index 9693d1f63c10..7c67db342f72 100644 --- a/rllib/examples/envs/classes/action_mask_env.py +++ b/rllib/examples/envs/classes/action_mask_env.py @@ -9,7 +9,6 @@ class ActionMaskEnv(RandomEnv): def __init__(self, config): super().__init__(config) - self._skip_env_checking = True # Masking only works for Discrete actions. assert isinstance(self.action_space, Discrete) # Add action_mask to observations. diff --git a/rllib/examples/envs/classes/cartpole_crashing.py b/rllib/examples/envs/classes/cartpole_crashing.py index 1829b5e15104..fe5e4f14b4f4 100644 --- a/rllib/examples/envs/classes/cartpole_crashing.py +++ b/rllib/examples/envs/classes/cartpole_crashing.py @@ -95,9 +95,6 @@ def __init__(self, config=None): print(f"Initializing crashing env (with init-delay of {sample}sec) ...") time.sleep(sample) - # No env pre-checking? - self._skip_env_checking = config.get("skip_env_checking", False) - # Make sure envs don't crash at the same time. 
self._rng = np.random.RandomState() diff --git a/rllib/examples/envs/classes/debug_counter_env.py b/rllib/examples/envs/classes/debug_counter_env.py index 04016dd41583..24d95e56b3bf 100644 --- a/rllib/examples/envs/classes/debug_counter_env.py +++ b/rllib/examples/envs/classes/debug_counter_env.py @@ -50,7 +50,6 @@ def __init__(self, config): self.timesteps = [0] * self.num_agents self.terminateds = set() self.truncateds = set() - self._skip_env_checking = True def reset(self, *, seed=None, options=None): self.timesteps = [0] * self.num_agents diff --git a/rllib/examples/envs/classes/pettingzoo_chess.py b/rllib/examples/envs/classes/pettingzoo_chess.py index ca36acc3c8e3..697ab01157f5 100644 --- a/rllib/examples/envs/classes/pettingzoo_chess.py +++ b/rllib/examples/envs/classes/pettingzoo_chess.py @@ -113,9 +113,6 @@ def __init__( else: self.env = env self.env.reset() - # TODO (avnishn): Remove this after making petting zoo env compatible with - # check_env. - self._skip_env_checking = True self.config = config if self.config is None: diff --git a/rllib/examples/envs/classes/pettingzoo_connect4.py b/rllib/examples/envs/classes/pettingzoo_connect4.py index 6398b64df1c1..e87861b97cc1 100644 --- a/rllib/examples/envs/classes/pettingzoo_connect4.py +++ b/rllib/examples/envs/classes/pettingzoo_connect4.py @@ -113,9 +113,6 @@ def __init__( else: self.env = env self.env.reset() - # TODO (avnishn): Remove this after making petting zoo env compatible with - # check_env. - self._skip_env_checking = True self.config = config # Get first observation space, assuming all agents have equal space diff --git a/rllib/examples/envs/classes/two_step_game.py b/rllib/examples/envs/classes/two_step_game.py index 540b22b534b3..bd40b03a0b08 100644 --- a/rllib/examples/envs/classes/two_step_game.py +++ b/rllib/examples/envs/classes/two_step_game.py @@ -13,7 +13,6 @@ def __init__(self, env_config): self.state = None self.agent_1 = 0 self.agent_2 = 1 - self._skip_env_checking = True # MADDPG emits action logits instead of actual discrete actions self.actions_are_logits = env_config.get("actions_are_logits", False) self.one_hot_state_encoding = env_config.get("one_hot_state_encoding", False) @@ -118,7 +117,6 @@ def __init__(self, env_config): self._obs_space_in_preferred_format = True self.action_space = Dict({"agents": self.env.action_space}) self._action_space_in_preferred_format = True - self._skip_env_checking = True def reset(self, *, seed=None, options=None): return self.env.reset(seed=seed, options=options) diff --git a/rllib/examples/envs/classes/windy_maze_env.py b/rllib/examples/envs/classes/windy_maze_env.py index b97a89d7fd6c..0a86fe4f9069 100644 --- a/rllib/examples/envs/classes/windy_maze_env.py +++ b/rllib/examples/envs/classes/windy_maze_env.py @@ -88,7 +88,6 @@ def _get_new_pos(self, pos, direction): class HierarchicalWindyMazeEnv(MultiAgentEnv): def __init__(self, env_config): super().__init__() - self._skip_env_checking = True self.flat_env = WindyMazeEnv(env_config) def reset(self, *, seed=None, options=None): diff --git a/rllib/offline/estimators/tests/utils.py b/rllib/offline/estimators/tests/utils.py index 37050244b8fe..2f28da1c7a25 100644 --- a/rllib/offline/estimators/tests/utils.py +++ b/rllib/offline/estimators/tests/utils.py @@ -45,7 +45,6 @@ def get_cliff_walking_wall_policy_and_data( AlgorithmConfig() .debugging(seed=seed) .rollouts(batch_mode="complete_episodes") - .environment(disable_env_checking=True) .experimental(_disable_preprocessor_api=True) ) config = config.to_dict() diff --git 
a/rllib/tests/backward_compat/test_gym_env_apis.py b/rllib/tests/backward_compat/test_gym_env_apis.py deleted file mode 100644 index 0ffdaae9ff60..000000000000 --- a/rllib/tests/backward_compat/test_gym_env_apis.py +++ /dev/null @@ -1,300 +0,0 @@ -import unittest - -import ray -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.rllib.env.wrappers.multi_agent_env_compatibility import ( - MultiAgentEnvCompatibility, -) -from ray.rllib.utils.gym import try_import_gymnasium_and_gym -from ray.tune.registry import register_env - -gym, old_gym = try_import_gymnasium_and_gym() - - -class GymnasiumOldAPI(gym.Env): - def __init__(self, config=None): - self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,)) - self.action_space = gym.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample() - - def step(self, action): - done = True - return self.observation_space.sample(), 1.0, done, {} - - def seed(self, seed=None): - pass - - def render(self, mode="human"): - pass - - -class GymnasiumNewAPIButOldSpaces(gym.Env): - render_mode = "human" - - def __init__(self, config=None): - self.observation_space = old_gym.spaces.Box(-1.0, 1.0, (1,)) - self.action_space = old_gym.spaces.Discrete(2) - - def reset(self, *, seed=None, options=None): - return self.observation_space.sample(), {} - - def step(self, action): - terminated = truncated = True - return self.observation_space.sample(), 1.0, terminated, truncated, {} - - def render(self): - pass - - -class GymnasiumNewAPIButThrowsErrorOnReset(gym.Env): - render_mode = "human" - - def __init__(self, config=None): - self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,)) - self.action_space = gym.spaces.Discrete(2) - - def reset(self, *, seed=None, options=None): - assert False, "kaboom!" 
- return self.observation_space.sample(), {} - - def step(self, action): - terminated = truncated = True - return self.observation_space.sample(), 1.0, terminated, truncated, {} - - def render(self): - pass - - -class OldGymEnv(old_gym.Env): - def __init__(self, config=None): - self.observation_space = old_gym.spaces.Box(-1.0, 1.0, (1,)) - self.action_space = old_gym.spaces.Discrete(2) - - def reset(self): - return self.observation_space.sample() - - def step(self, action): - done = True - return self.observation_space.sample(), 1.0, done, {} - - def seed(self, seed=None): - pass - - def render(self, mode="human"): - pass - - -class MultiAgentGymnasiumOldAPI(MultiAgentEnv): - def __init__(self, config=None): - super().__init__() - self.observation_space = gym.spaces.Dict( - {"agent0": gym.spaces.Box(-1.0, 1.0, (1,))} - ) - self.action_space = gym.spaces.Dict({"agent0": gym.spaces.Discrete(2)}) - self._agent_ids = {"agent0"} - - def reset(self): - return {"agent0": self.observation_space.sample()} - - def step(self, action): - done = True - return ( - {"agent0": self.observation_space.sample()}, - {"agent0": 1.0}, - {"agent0": done, "__all__": done}, - {}, - ) - - def seed(self, seed=None): - pass - - def render(self, mode="human"): - pass - - -class TestGymEnvAPIs(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_gymnasium_old_api(self): - """Tests a gymnasium Env that uses the old API.""" - - def test_(): - ( - PPOConfig() - .environment(env=GymnasiumOldAPI, auto_wrap_old_gym_envs=False) - # Forces the error to be raised on the local worker so that it is not - # swallowed by a RayActorError and speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - - self.assertRaisesRegex( - ValueError, - ".*In particular, the `reset\\(\\)` method seems to be faulty..*", - lambda: test_(), - ) - - def test_gymnasium_old_api_using_auto_wrap(self): - """Tests a gymnasium Env that uses the old API, but is auto-wrapped by RLlib.""" - algo = ( - PPOConfig() - .environment(env=GymnasiumOldAPI, auto_wrap_old_gym_envs=True) - # Speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - algo.train() - algo.stop() - - def test_gymnasium_new_api_but_old_spaces(self): - """Tests a gymnasium Env that uses the new API, but has old spaces.""" - - def test_(): - ( - PPOConfig() - .environment(GymnasiumNewAPIButOldSpaces, auto_wrap_old_gym_envs=True) - # Forces the error to be raised on the local worker so that it is not - # swallowed by a RayActorError and speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - - self.assertRaisesRegex( - ValueError, - "Observation space must be a gymnasium.Space!", - lambda: test_(), - ) - - def test_gymnasium_new_api_but_throws_error_on_reset(self): - """Tests a gymnasium Env that uses the new API, but errors on reset() call.""" - - def test_(): - ( - PPOConfig() - .environment( - GymnasiumNewAPIButThrowsErrorOnReset, - auto_wrap_old_gym_envs=True, - ) - # Forces the error to be raised on the local worker so that it is not - # swallowed by a RayActorError and speeds the test up. 
- .rollouts(num_rollout_workers=0) - .build() - ) - - self.assertRaisesRegex(AssertionError, "kaboom!", lambda: test_()) - - def test_gymnasium_old_api_but_manually_wrapped(self): - """Tests a gymnasium Env that uses the old API, but is correctly wrapped.""" - - from gymnasium.wrappers import EnvCompatibility - - register_env( - "test", - lambda env_ctx: EnvCompatibility(GymnasiumOldAPI(env_ctx)), - ) - - algo = ( - PPOConfig() - .environment("test", auto_wrap_old_gym_envs=False) - # Speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - algo.train() - algo.stop() - - def test_old_gym_env(self): - """Tests a old gym.Env (should fail, even with auto-wrapping enabled).""" - - def test_(): - ( - PPOConfig() - .environment(env=OldGymEnv, auto_wrap_old_gym_envs=True) - # Forces the error to be raised on the local worker so that it is not - # swallowed by a RayActorError and speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - - self.assertRaisesRegex( - ValueError, - "does not abide to the new gymnasium-style API", - lambda: test_(), - ) - - def test_multi_agent_gymnasium_old_api(self): - """Tests a MultiAgentEnv (gymnasium.Env subclass) that uses the old API.""" - - def test_(): - ( - PPOConfig() - .environment( - MultiAgentGymnasiumOldAPI, - auto_wrap_old_gym_envs=False, - ) - # Forces the error to be raised on the local worker so that it is not - # swallowed by a RayActorError and speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - - self.assertRaisesRegex( - ValueError, - ".*In particular, the `reset\\(\\)` method seems to be faulty..*", - lambda: test_(), - ) - - def test_multi_agent_gymnasium_old_api_auto_wrapped(self): - """Tests a MultiAgentEnv (gymnasium.Env subclass) that uses the old API.""" - - algo = ( - PPOConfig() - .environment( - MultiAgentGymnasiumOldAPI, - auto_wrap_old_gym_envs=True, - disable_env_checking=True, - ) - # Speeds the test up. - .rollouts(num_rollout_workers=0) - .build() - ) - algo.train() - algo.stop() - - def test_multi_agent_gymnasium_old_api_manually_wrapped(self): - """Tests a MultiAgentEnv (gymnasium.Env subclass) that uses the old API.""" - - register_env( - "test", - lambda env_ctx: MultiAgentEnvCompatibility( - MultiAgentGymnasiumOldAPI(env_ctx) - ), - ) - - algo = ( - PPOConfig() - .environment( - "test", auto_wrap_old_gym_envs=False, disable_env_checking=True - ) - # Speeds the test up. 
- .rollouts(num_rollout_workers=0) - .build() - ) - algo.train() - algo.stop() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index 2fab23c2047c..0c6439301aff 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -409,7 +409,7 @@ def do_test_nested_dict(self, make_env, test_lstm=False): config = ( PPOConfig() .experimental(_disable_preprocessor_api=True) - .environment("nested", disable_env_checking=True) + .environment("nested") .rollouts(num_rollout_workers=0, rollout_fragment_length=5) .framework("tf") .training( @@ -443,7 +443,7 @@ def do_test_nested_tuple(self, make_env): config = ( PPOConfig() .experimental(_disable_preprocessor_api=True) - .environment("nested2", disable_env_checking=True) + .environment("nested2") .rollouts(num_rollout_workers=0, rollout_fragment_length=5) .framework("tf") .training( diff --git a/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py index 9f64ffcccbd0..786f669376d6 100644 --- a/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -33,9 +33,6 @@ "stall_time_sec": (2, 5), # stall between 2 and 10sec. "stall_on_worker_indices": [2, 3], }, - # Disable env checking. Env checker doesn't handle Exceptions from - # user envs, and will crash rollout worker. - disable_env_checking=True, ) .rollouts( num_rollout_workers=1, diff --git a/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py index 30a205944331..3bb04eebf0a6 100644 --- a/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py @@ -28,9 +28,6 @@ "p_crash_reset": 0.005, # prob to crash during reset() "crash_on_worker_indices": [1, 2], }, - # Disable env checking. Env checker doesn't handle Exceptions from - # user envs, and will crash rollout worker. - disable_env_checking=True, ) .rollouts( num_rollout_workers=3, diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py index 11208ba336ec..83c76d38a259 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -34,9 +34,6 @@ "stall_time_sec": (2, 5), # stall between 2 and 10sec. "stall_on_worker_indices": [2, 3], }, - # Disable env checking. Env checker doesn't handle Exceptions from - # user envs, and will crash rollout worker. 
- disable_env_checking=True, ) .rollouts( num_rollout_workers=3, diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py index ecfab30e2795..312aae1aa83f 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py @@ -29,9 +29,6 @@ "p_crash": 0.0005, # prob to crash during step() "p_crash_reset": 0.005, # prob to crash during reset() }, - # Disable env checking. Env checker doesn't handle Exceptions from - # user envs, and will crash rollout worker. - disable_env_checking=True, ) .rollouts( num_rollout_workers=4, diff --git a/rllib/utils/gym.py b/rllib/utils/gym.py deleted file mode 100644 index 55f16aa8713e..000000000000 --- a/rllib/utils/gym.py +++ /dev/null @@ -1,87 +0,0 @@ -import gymnasium as gym -from typing import Optional - -from ray.util.annotations import DeveloperAPI - - -@DeveloperAPI -def check_old_gym_env( - env: Optional[gym.Env] = None, *, step_results=None, reset_results=None -): - # Check `reset()` results. - if reset_results is not None: - if ( - # Result is NOT a tuple? - not isinstance(reset_results, tuple) - # Result is a tuple of len!=2? - or len(reset_results) != 2 - # The second item is a NOT dict (infos)? - or not isinstance(reset_results[1], dict) - # Result is a tuple of len=2 and the second item is a dict (infos) and - # our env does NOT have obs space 2-Tuple with the second space being a - # dict? - or ( - env - and isinstance(env.observation_space, gym.spaces.Tuple) - and len(env.observation_space.spaces) >= 2 - and isinstance(env.observation_space.spaces[1], gym.spaces.Dict) - ) - ): - raise ValueError( - "The number of values returned from `gym.Env.reset(seed=.., options=..)" - "` must be 2! Make sure your `reset()` method returns: [obs] and " - "[infos]." - ) - # Check `step()` results. - elif step_results is not None: - if len(step_results) == 5: - return False - else: - raise ValueError( - "The number of values returned from `gym.Env.step([action])` must be " - "5 (new gym.Env API including `truncated` flags)! Make sure your " - "`step()` method returns: [obs], [reward], [terminated], " - "[truncated], and [infos]!" - ) - - else: - raise AttributeError( - "Either `step_results` or `reset_results` most be provided to " - "`check_old_gym_env()`!" - ) - return False - - -@DeveloperAPI -def convert_old_gym_space_to_gymnasium_space(space) -> gym.Space: - """Converts an old gym (NOT gymnasium) Space into a gymnasium.Space. - - Args: - space: The gym.Space to convert to gymnasium.Space. - - Returns: - The converted gymnasium.space object. - """ - from ray.rllib.utils.serialization import gym_space_from_dict, gym_space_to_dict - - return gym_space_from_dict(gym_space_to_dict(space)) - - -@DeveloperAPI -def try_import_gymnasium_and_gym(): - try: - import gymnasium as gym - except (ImportError, ModuleNotFoundError): - raise ImportError( - "The `gymnasium` package seems to be not installed! As of Ray 2.2, it is " - "required for RLlib. Try running `pip install gymnasium` from the " - "command line to fix this problem." 
- ) - - old_gym = None - try: - import gym as old_gym - except (ImportError, ModuleNotFoundError): - pass - - return gym, old_gym diff --git a/rllib/utils/pre_checks/env.py b/rllib/utils/pre_checks/env.py index a1c60da96a7b..0f74d9a64ffc 100644 --- a/rllib/utils/pre_checks/env.py +++ b/rllib/utils/pre_checks/env.py @@ -1,280 +1,22 @@ """Common pre-checks for all RLlib experiments.""" import logging -import traceback from copy import copy -from typing import TYPE_CHECKING, Optional, Set, Union +from typing import TYPE_CHECKING, Set +import gymnasium as gym import numpy as np import tree # pip install dm_tree -from ray.actor import ActorHandle from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.utils.error import ERR_MSG_OLD_GYM_API, UnsupportedSpaceException -from ray.rllib.utils.gym import check_old_gym_env, try_import_gymnasium_and_gym -from ray.rllib.utils.spaces.space_utils import ( - convert_element_to_space_type, - get_base_struct_from_space, -) -from ray.rllib.utils.typing import EnvType +from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space from ray.util import log_once if TYPE_CHECKING: - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - from ray.rllib.env import BaseEnv, MultiAgentEnv, VectorEnv + from ray.rllib.env import MultiAgentEnv logger = logging.getLogger(__name__) -gym, old_gym = try_import_gymnasium_and_gym() - - -@DeveloperAPI -def check_env(env: EnvType, config: Optional["AlgorithmConfig"] = None) -> None: - """Run pre-checks on env that uncover common errors in environments. - - Args: - env: Environment to be checked. - config: Additional checks config. - - Raises: - ValueError: If env is not an instance of SUPPORTED_ENVIRONMENT_TYPES. - ValueError: See check_gym_env docstring for details. - """ - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - from ray.rllib.env import ( - BaseEnv, - ExternalEnv, - ExternalMultiAgentEnv, - MultiAgentEnv, - RemoteBaseEnv, - VectorEnv, - ) - - if hasattr(env, "_skip_env_checking") and env._skip_env_checking: - # This is a work around for some environments that we already have in RLlb - # that we want to skip checking for now until we have the time to fix them. - if log_once("skip_env_checking"): - logger.warning("Skipping env checking for this experiment") - return - - try: - if not isinstance( - env, - ( - BaseEnv, - gym.Env, - MultiAgentEnv, - RemoteBaseEnv, - VectorEnv, - ExternalMultiAgentEnv, - ExternalEnv, - ActorHandle, - ), - ) and (not old_gym or not isinstance(env, old_gym.Env)): - raise ValueError( - "Env must be of one of the following supported types: BaseEnv, " - "gymnasium.Env, gym.Env, " - "MultiAgentEnv, VectorEnv, RemoteBaseEnv, ExternalMultiAgentEnv, " - f"ExternalEnv, but instead is of type {type(env)}." - ) - - if isinstance(env, MultiAgentEnv): - check_multiagent_environments(env) - elif isinstance(env, VectorEnv): - check_vector_env(env) - elif isinstance(env, gym.Env) or old_gym and isinstance(env, old_gym.Env): - check_gym_environments(env, AlgorithmConfig() if config is None else config) - elif isinstance(env, BaseEnv): - check_base_env(env) - else: - logger.warning( - "Env checking isn't implemented for RemoteBaseEnvs, " - "ExternalMultiAgentEnv, ExternalEnvs or environments that are " - "Ray actors." - ) - except Exception: - actual_error = traceback.format_exc() - raise ValueError( - f"{actual_error}\n" - "The above error has been found in your environment! " - "We've added a module for checking your custom environments. 
It " - "may cause your experiment to fail if your environment is not set up " - "correctly. You can disable this behavior via calling `config." - "environment(disable_env_checking=True)`. You can run the " - "environment checking module standalone by calling " - "ray.rllib.utils.check_env([your env])." - ) - - -@DeveloperAPI -def check_gym_environments( - env: Union[gym.Env, "old_gym.Env"], config: "AlgorithmConfig" -) -> None: - """Checking for common errors in a gymnasium/gym environments. - - Args: - env: Environment to be checked. - config: Additional checks config. - - Warning: - If env has no attribute spec with a sub attribute, - max_episode_steps. - - Raises: - AttributeError: If env has no observation space. - AttributeError: If env has no action space. - ValueError: Observation space must be a gym.spaces.Space. - ValueError: Action space must be a gym.spaces.Space. - ValueError: Observation sampled from observation space must be - contained in the observation space. - ValueError: Action sampled from action space must be - contained in the observation space. - ValueError: If env cannot be resetted. - ValueError: If an observation collected from a call to env.reset(). - is not contained in the observation_space. - ValueError: If env cannot be stepped via a call to env.step(). - ValueError: If the observation collected from env.step() is not - contained in the observation_space. - AssertionError: If env.step() returns a reward that is not an - int or float. - AssertionError: IF env.step() returns a done that is not a bool. - AssertionError: If env.step() returns an env_info that is not a dict. - """ - - # Check for old gym.Env. - if old_gym and isinstance(env, old_gym.Env): - raise ValueError(ERR_MSG_OLD_GYM_API.format(env, "")) - - # Check that env has observation and action spaces. - if not hasattr(env, "observation_space"): - raise AttributeError("Env must have observation_space.") - if not hasattr(env, "action_space"): - raise AttributeError("Env must have action_space.") - - # check that observation and action spaces are gym.spaces - if not isinstance(env.observation_space, gym.spaces.Space): - raise ValueError("Observation space must be a gymnasium.Space!") - if not isinstance(env.action_space, gym.spaces.Space): - raise ValueError("Action space must be a gymnasium.Space!") - - # Raise a warning if there isn't a max_episode_steps attribute. - if not hasattr(env, "spec") or not hasattr(env.spec, "max_episode_steps"): - if log_once("max_episode_steps"): - logger.warning( - "Your env doesn't have a .spec.max_episode_steps " - "attribute. Your horizon will default " - "to infinity, and your environment will not be " - "reset." - ) - - # check if sampled actions and observations are contained within their - # respective action and observation spaces. - - sampled_observation = env.observation_space.sample() - sampled_action = env.action_space.sample() - - # Check, whether resetting works as expected. - try: - env.reset() - except Exception as e: - raise ValueError( - "Your gymnasium.Env's `reset()` method raised an Exception!" - ) from e - - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. - try: - # Important: Don't seed the env here by accident. - # User would not notice and get stuck with an always fixed seeded env. 
- obs_and_infos = env.reset(seed=None, options={}) - check_old_gym_env(reset_results=obs_and_infos) - except Exception as e: - raise ValueError( - ERR_MSG_OLD_GYM_API.format( - env, "In particular, the `reset()` method seems to be faulty." - ) - ) from e - reset_obs, reset_infos = obs_and_infos - - # Check if observation generated from resetting the environment is - # contained within the observation space. - if not env.observation_space.contains(reset_obs): - temp_sampled_reset_obs = convert_element_to_space_type( - reset_obs, sampled_observation - ) - if not env.observation_space.contains(temp_sampled_reset_obs): - # Find offending subspace in case we have a complex observation space. - key, space, space_type, value, value_type = _find_offending_sub_space( - env.observation_space, temp_sampled_reset_obs - ) - raise ValueError( - "The observation collected from env.reset() was not " - "contained within your env's observation space. It is possible " - "that there was a type mismatch, or that one of the " - "sub-observations was out of bounds:\n {}(sub-)obs: {} ({})" - "\n (sub-)observation space: {} ({})".format( - ("path: '" + key + "'\n ") if key else "", - value, - value_type, - space, - space_type, - ) - ) - # sample a valid action in case of parametric actions - if isinstance(reset_obs, dict): - if config.action_mask_key in reset_obs: - sampled_action = env.action_space.sample( - mask=reset_obs[config.action_mask_key] - ) - - # Check if env.step can run, and generates observations rewards, done - # signals and infos that are within their respective spaces and are of - # the correct dtypes. - try: - results = env.step(sampled_action) - except Exception as e: - raise ValueError( - "Your gymnasium.Env's `step()` method raised an Exception!" - ) from e - - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. - try: - check_old_gym_env(step_results=results) - except Exception as e: - raise ValueError( - ERR_MSG_OLD_GYM_API.format( - env, "In particular, the `step()` method seems to be faulty." - ) - ) from e - next_obs, reward, done, truncated, info = results - - if not env.observation_space.contains(next_obs): - temp_sampled_next_obs = convert_element_to_space_type( - next_obs, sampled_observation - ) - if not env.observation_space.contains(temp_sampled_next_obs): - # Find offending subspace in case we have a complex observation space. - key, space, space_type, value, value_type = _find_offending_sub_space( - env.observation_space, temp_sampled_next_obs - ) - error = ( - "The observation collected from env.step(sampled_action) was not " - "contained within your env's observation space. It is possible " - "that there was a type mismatch, or that one of the " - "sub-observations was out of bounds: \n\n {}(sub-)obs: {} ({})" - "\n (sub-)observation space: {} ({})".format( - ("path='" + key + "'\n ") if key else "", - value, - value_type, - space, - space_type, - ) - ) - raise ValueError(error) - _check_done_and_truncated(done, truncated) - _check_reward(reward) - _check_info(info) - @DeveloperAPI def check_multiagent_environments(env: "MultiAgentEnv") -> None: @@ -305,9 +47,6 @@ def check_multiagent_environments(env: "MultiAgentEnv") -> None: try: obs_and_infos = env.reset(seed=42, options={}) - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. 
- check_old_gym_env(reset_results=obs_and_infos) except Exception as e: raise ValueError( ERR_MSG_OLD_GYM_API.format( @@ -361,9 +100,6 @@ def check_multiagent_environments(env: "MultiAgentEnv") -> None: try: results = env.step(sampled_action) - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. - check_old_gym_env(step_results=results) except Exception as e: raise ValueError( ERR_MSG_OLD_GYM_API.format( @@ -395,235 +131,6 @@ def check_multiagent_environments(env: "MultiAgentEnv") -> None: raise ValueError(error) -@DeveloperAPI -def check_base_env(env: "BaseEnv") -> None: - """Checking for common errors in RLlib BaseEnvs. - - Args: - env: The env to be checked. - """ - from ray.rllib.env import BaseEnv - - if not isinstance(env, BaseEnv): - raise ValueError("The passed env is not a BaseEnv.") - - try: - obs_and_infos = env.try_reset(seed=42, options={}) - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. - check_old_gym_env(reset_results=obs_and_infos) - except Exception as e: - raise ValueError( - ERR_MSG_OLD_GYM_API.format( - env, "In particular, the `try_reset()` method seems to be faulty." - ) - ) from e - reset_obs, reset_infos = obs_and_infos - - sampled_obs = env.observation_space_sample() - _check_if_multi_env_dict(env, reset_obs, "try_reset") - _check_if_multi_env_dict(env, sampled_obs, "observation_space_sample()") - - try: - env.observation_space_contains(reset_obs) - except Exception as e: - raise ValueError( - "Your observation_space_contains function has some error " - ) from e - - if not env.observation_space_contains(reset_obs): - error = ( - _not_contained_error("try_reset", "observation") - + f": \n\n reset_obs: {reset_obs}\n\n " - f"env.observation_space_sample(): {sampled_obs}\n\n " - ) - raise ValueError(error) - - if not env.observation_space_contains(sampled_obs): - error = ( - _not_contained_error("observation_space_sample", "observation") - + f": \n\n sampled_obs: {sampled_obs}\n\n " - ) - raise ValueError(error) - - sampled_action = env.action_space_sample() - try: - env.action_space_contains(sampled_action) - except Exception as e: - raise ValueError("Your action_space_contains function has some error ") from e - if not env.action_space_contains(sampled_action): - error = ( - _not_contained_error("action_space_sample", "action") - + f": \n\n sampled_action {sampled_action}\n\n" - ) - raise ValueError(error) - _check_if_multi_env_dict(env, sampled_action, "action_space_sample()") - - env.send_actions(sampled_action) - - try: - results = env.poll() - # No more gym < 0.26 support! Error and explain the user how to upgrade to - # gymnasium. - check_old_gym_env(step_results=results[:-1]) - except Exception as e: - raise ValueError( - ERR_MSG_OLD_GYM_API.format( - env, "In particular, the `poll()` method seems to be faulty." 
- ) - ) from e - next_obs, reward, done, truncated, info, _ = results - - _check_if_multi_env_dict(env, next_obs, "step, next_obs") - _check_if_multi_env_dict(env, reward, "step, reward") - _check_if_multi_env_dict(env, done, "step, done") - _check_if_multi_env_dict(env, truncated, "step, truncated") - _check_if_multi_env_dict(env, info, "step, info") - - if not env.observation_space_contains(next_obs): - error = ( - _not_contained_error("poll", "observation") - + f": \n\n reset_obs: {reset_obs}\n\n env.step():{next_obs}\n\n" - ) - raise ValueError(error) - - _check_reward(reward, base_env=True, agent_ids=env.get_agent_ids()) - _check_done_and_truncated( - done, - truncated, - base_env=True, - agent_ids=env.get_agent_ids(), - ) - _check_info(info, base_env=True, agent_ids=env.get_agent_ids()) - - -@DeveloperAPI -def check_vector_env(env: "VectorEnv") -> None: - """Checking for common errors in RLlib VectorEnvs. - - Args: - env: The env to be checked. - """ - sampled_obs = env.observation_space.sample() - - # Test `vector_reset()`. - try: - vector_reset = env.vector_reset( - seeds=[42] * env.num_envs, - options=[{}] * env.num_envs, - ) - except Exception as e: - raise ValueError( - "Your Env's `vector_reset()` method has some error! Make sure it expects a " - "list of `seeds` (int) as well as a list of `options` dicts as optional, " - "named args, e.g. def vector_reset(self, index: int, *, seeds: " - "Optional[List[int]] = None, options: Optional[List[dict]] = None)" - ) from e - - if not isinstance(vector_reset, tuple) or len(vector_reset) != 2: - raise ValueError( - "The `vector_reset()` method of your env must return a Tuple[obs, infos] as" - f" of gym>=0.26! Your method returned: {vector_reset}." - ) - reset_obs, reset_infos = vector_reset - if not isinstance(reset_obs, list) or len(reset_obs) != env.num_envs: - raise ValueError( - "The observations returned by your env's `vector_reset()` method is NOT a " - f"list or do not contain exactly `num_envs` ({env.num_envs}) items! " - f"Your observations were: {reset_obs}" - ) - if not isinstance(reset_infos, list) or len(reset_infos) != env.num_envs: - raise ValueError( - "The infos returned by your env's `vector_reset()` method is NOT a " - f"list or do not contain exactly `num_envs` ({env.num_envs}) items! " - f"Your infos were: {reset_infos}" - ) - try: - env.observation_space.contains(reset_obs[0]) - except Exception as e: - raise ValueError( - "Your `observation_space.contains` function has some error!" - ) from e - if not env.observation_space.contains(reset_obs[0]): - error = ( - _not_contained_error("vector_reset", "observation") - + f": \n\n reset_obs: {reset_obs}\n\n " - f"env.observation_space.sample(): {sampled_obs}\n\n " - ) - raise ValueError(error) - - # Test `reset_at()`. - try: - reset_at = env.reset_at(index=0, seed=42, options={}) - except Exception as e: - raise ValueError( - "Your Env's `reset_at()` method has some error! Make sure it expects a " - "vector index (int) and an optional seed (int) as args." - ) from e - if not isinstance(reset_at, tuple) or len(reset_at) != 2: - raise ValueError( - "The `reset_at()` method of your env must return a Tuple[obs, infos] as " - f"of gym>=0.26! Your method returned: {reset_at}." - ) - reset_obs, reset_infos = reset_at - if not isinstance(reset_infos, dict): - raise ValueError( - "The `reset_at()` method of your env must return an info dict as second " - f"return value! 
Your method returned {reset_infos}" - ) - if not env.observation_space.contains(reset_obs): - error = ( - _not_contained_error("try_reset", "observation") - + f": \n\n reset_obs: {reset_obs}\n\n " - f"env.observation_space.sample(): {sampled_obs}\n\n " - ) - raise ValueError(error) - - # Test `observation_space_sample()` and `observation_space_contains()`: - if not env.observation_space.contains(sampled_obs): - error = ( - _not_contained_error("observation_space.sample()", "observation") - + f": \n\n sampled_obs: {sampled_obs}\n\n " - ) - raise ValueError(error) - - # Test `vector_step()` - sampled_action = env.action_space.sample() - if not env.action_space.contains(sampled_action): - error = ( - _not_contained_error("action_space.sample()", "action") - + f": \n\n sampled_action {sampled_action}\n\n" - ) - raise ValueError(error) - - step_results = env.vector_step([sampled_action for _ in range(env.num_envs)]) - if not isinstance(step_results, tuple) or len(step_results) != 5: - raise ValueError( - "The `vector_step()` method of your env must return a Tuple[" - "List[obs], List[rewards], List[terminateds], List[truncateds], " - f"List[infos]] as of gym>=0.26! Your method returned: {step_results}." - ) - - obs, rewards, terminateds, truncateds, infos = step_results - - _check_if_vetor_env_list(env, obs, "step, obs") - _check_if_vetor_env_list(env, rewards, "step, rewards") - _check_if_vetor_env_list(env, terminateds, "step, terminateds") - _check_if_vetor_env_list(env, truncateds, "step, truncateds") - _check_if_vetor_env_list(env, infos, "step, infos") - - if not env.observation_space.contains(obs[0]): - error = ( - _not_contained_error("vector_step", "observation") - + f": \n\n obs: {obs[0]}\n\n env.vector_step():{obs}\n\n" - ) - raise ValueError(error) - - _check_reward(rewards[0], base_env=False) - _check_done_and_truncated(terminateds[0], truncateds[0], base_env=False) - _check_info(infos[0], base_env=False) - - def _check_reward(reward, base_env=False, agent_ids=None): if base_env: for _, multi_agent_dict in reward.items(): @@ -729,26 +236,6 @@ def _not_contained_error(func_name, _type): return _error -def _check_if_multi_env_dict(env, element, function_string): - if not isinstance(element, dict): - raise ValueError( - f"The element returned by {function_string} is not a " - f"MultiEnvDict. Instead, it is of type: {type(element)}" - ) - env_ids = env.get_sub_environments(as_dict=True).keys() - if not all(k in env_ids for k in element): - raise ValueError( - f"The element returned by {function_string} " - f"has dict keys that don't correspond to " - f"environment ids for this env " - f"{list(env_ids)}" - ) - for _, multi_agent_dict in element.items(): - _check_if_element_multi_agent_dict( - env, multi_agent_dict, function_string, base_env=True - ) - - def _check_if_element_multi_agent_dict( env, element, @@ -797,16 +284,6 @@ def _check_if_element_multi_agent_dict( raise ValueError(error) -def _check_if_vetor_env_list(env, element, function_string): - if not isinstance(element, list) or len(element) != env.num_envs: - raise ValueError( - f"The element returned by {function_string} is not a " - f"list OR the length of the returned list is not the same as the number of " - f"sub-environments ({env.num_envs}) in your VectorEnv! " - f"Instead, your {function_string} returned {element}" - ) - - def _find_offending_sub_space(space, value): """Returns error, value, and space when offending `space.contains(value)` fails. 
diff --git a/rllib/utils/serialization.py b/rllib/utils/serialization.py index 4e703500de50..a3b6975c00b2 100644 --- a/rllib/utils/serialization.py +++ b/rllib/utils/serialization.py @@ -5,11 +5,11 @@ import zlib from typing import Any, Dict, Optional, Sequence, Type, Union +import gymnasium as gym import numpy as np import ray from ray.rllib.utils.annotations import DeveloperAPI -from ray.rllib.utils.gym import try_import_gymnasium_and_gym from ray.rllib.utils.error import NotSerializable from ray.rllib.utils.spaces.flexdict import FlexDict from ray.rllib.utils.spaces.repeated import Repeated @@ -17,12 +17,6 @@ NOT_SERIALIZABLE = "__not_serializable__" -gym, old_gym = try_import_gymnasium_and_gym() - -old_gym_text_class = None -if old_gym: - old_gym_text_class = getattr(old_gym.spaces, "Text", None) - @DeveloperAPI def convert_numpy_to_python_primitives(obj: Any): @@ -197,19 +191,6 @@ def _text(sp: "gym.spaces.Text") -> Dict: return _repeated(space) elif isinstance(space, FlexDict): return _flex_dict(space) - # Old gym Spaces. - elif old_gym and isinstance(space, old_gym.spaces.Box): - return _box(space) - elif old_gym and isinstance(space, old_gym.spaces.Discrete): - return _discrete(space) - elif old_gym and isinstance(space, old_gym.spaces.MultiDiscrete): - return _multi_discrete(space) - elif old_gym and isinstance(space, old_gym.spaces.Tuple): - return _tuple(space) - elif old_gym and isinstance(space, old_gym.spaces.Dict): - return _dict(space) - elif old_gym and old_gym_text_class and isinstance(space, old_gym_text_class): - return _text(space) else: raise ValueError("Unknown space type for serialization, ", type(space)) diff --git a/rllib/utils/tests/test_check_env.py b/rllib/utils/tests/test_check_env.py deleted file mode 100644 index 0c7958255d65..000000000000 --- a/rllib/utils/tests/test_check_env.py +++ /dev/null @@ -1,368 +0,0 @@ -import logging -import unittest -from unittest.mock import MagicMock, Mock - -import gymnasium as gym -import numpy as np -import pytest -from gymnasium.spaces import Box, Dict, Discrete - -from ray.rllib.env.base_env import convert_to_base_env -from ray.rllib.env.multi_agent_env import MultiAgentEnvWrapper, make_multi_agent -from ray.rllib.examples.envs.classes.parametric_actions_cartpole import ( - ParametricActionsCartPole, -) -from ray.rllib.examples.envs.classes.random_env import RandomEnv -from ray.rllib.utils.pre_checks.env import ( - check_base_env, - check_env, - check_gym_environments, - check_multiagent_environments, -) - - -class TestGymCheckEnv(unittest.TestCase): - @pytest.fixture(autouse=True) - def inject_fixtures(self, caplog): - caplog.set_level(logging.CRITICAL) - - def test_has_observation_and_action_space(self): - env = Mock(spec=[]) - with pytest.raises(AttributeError, match="Env must have observation_space."): - check_gym_environments(env, Mock()) - env = Mock(spec=["observation_space"]) - with pytest.raises(AttributeError, match="Env must have action_space."): - check_gym_environments(env, Mock()) - - def test_obs_and_action_spaces_are_gym_spaces(self): - env = RandomEnv() - observation_space = env.observation_space - env.observation_space = "not a gym space" - with pytest.raises(ValueError, match="Observation space must be a gym.space"): - check_env(env) - env.observation_space = observation_space - env.action_space = "not an action space" - with pytest.raises(ValueError, match="Action space must be a gym.space"): - check_env(env) - - def test_reset(self): - reset = MagicMock(return_value=5) - env = RandomEnv() - env.reset = 
reset - # Check reset with out of bounds fails. - error = ".*The observation collected from env.reset().*" - with pytest.raises(ValueError, match=error): - check_env(env) - - # Check reset with obs of incorrect type fails. - reset = MagicMock(return_value=float(0.1)) - env.reset = reset - with pytest.raises(ValueError, match=error): - check_env(env) - - # Check reset with complex obs in which one sub-space is incorrect. - env = RandomEnv( - config={ - "observation_space": Dict( - {"a": Discrete(4), "b": Box(-1.0, 1.0, (1,))} - ), - } - ) - reset = MagicMock(return_value={"a": float(0.1), "b": np.array([0.5])}) - error = ".*The observation collected from env.reset.*\\n path: 'a'.*" - env.reset = reset - self.assertRaisesRegex(ValueError, error, lambda: check_env(env)) - - def test_step(self): - step = MagicMock(return_value=(5, 5, True, {})) - env = RandomEnv() - env.step = step - error = ".*The observation collected from env.step.*" - with pytest.raises(ValueError, match=error): - check_env(env) - - # check reset that returns obs of incorrect type fails - step = MagicMock(return_value=(float(0.1), 5, True, {})) - env.step = step - with pytest.raises(ValueError, match=error): - check_env(env) - - # check step that returns reward of non float/int fails - step = MagicMock(return_value=(1, "Not a valid reward", True, {})) - env.step = step - error = "Your step function must return a reward that is integer or float." - with pytest.raises(ValueError, match=error): - check_env(env) - - # check step that returns a non bool fails - step = MagicMock(return_value=(1, float(5), "not a valid done signal", {})) - env.step = step - error = "Your step function must return a done that is a boolean." - with pytest.raises(ValueError, match=error): - check_env(env) - - # check step that returns a non dict fails - step = MagicMock(return_value=(1, float(5), True, "not a valid env info")) - env.step = step - error = "Your step function must return a info that is a dict." 
- with pytest.raises(ValueError, match=error): - check_env(env) - - def test_parametric_actions(self): - env = ParametricActionsCartPole(10) - check_env(env) - - -class TestCheckMultiAgentEnv(unittest.TestCase): - @pytest.fixture(autouse=True) - def inject_fixtures(self, caplog): - caplog.set_level(logging.CRITICAL) - - def test_check_env_not_correct_type_error(self): - env = RandomEnv() - with pytest.raises(ValueError, match="The passed env is not"): - check_multiagent_environments(env) - - def test_check_env_reset_incorrect_error(self): - reset = MagicMock(return_value=5) - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - env.reset = reset - with pytest.raises(ValueError, match="The element returned by reset"): - check_env(env) - bad_obs = { - 0: np.array([np.inf, np.inf, np.inf, np.inf]), - 1: np.array([np.inf, np.inf, np.inf, np.inf]), - } - env.reset = lambda *_: bad_obs - with pytest.raises(ValueError, match="The observation collected from env"): - check_env(env) - - def test_check_incorrect_space_contains_functions_error(self): - def bad_contains_function(self, x): - raise ValueError("This is a bad contains function") - - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - env.observation_space_contains = bad_contains_function - with pytest.raises( - ValueError, match="Your observation_space_contains function has some" - ): - check_env(env) - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - bad_action = {0: 2, 1: 2} - env.action_space_sample = lambda *_: bad_action - with pytest.raises( - ValueError, match="The action collected from action_space_sample" - ): - check_env(env) - - env.action_space_contains = bad_contains_function - with pytest.raises( - ValueError, match="Your action_space_contains function has some error" - ): - check_env(env) - - def test_check_env_step_incorrect_error(self): - step = MagicMock(return_value=(5, 5, True, {})) - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - sampled_obs, info = env.reset() - env.step = step - with pytest.raises(ValueError, match="The element returned by step"): - check_env(env) - - step = MagicMock(return_value=(sampled_obs, {0: "Not a reward"}, {0: True}, {})) - env.step = step - with pytest.raises(ValueError, match="Your step function must return rewards"): - check_env(env) - step = MagicMock(return_value=(sampled_obs, {0: 5}, {0: "Not a bool"}, {})) - env.step = step - with pytest.raises(ValueError, match="Your step function must return dones"): - check_env(env) - - step = MagicMock( - return_value=(sampled_obs, {0: 5}, {0: False}, {0: "Not a Dict"}) - ) - env.step = step - with pytest.raises(ValueError, match="Your step function must return infos"): - check_env(env) - - def test_bad_sample_function(self): - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - bad_action = {0: 2, 1: 2} - env.action_space_sample = lambda *_: bad_action - with pytest.raises( - ValueError, match="The action collected from action_space_sample" - ): - check_env(env) - env = make_multi_agent("CartPole-v1")({"num_agents": 2}) - bad_obs = { - 0: np.array([np.inf, np.inf, np.inf, np.inf]), - 1: np.array([np.inf, np.inf, np.inf, np.inf]), - } - env.observation_space_sample = lambda *_: bad_obs - with pytest.raises( - ValueError, - match="The observation collected from observation_space_sample", - ): - check_env(env) - - -class TestCheckBaseEnv: - def _make_base_env(self): - del self - num_envs = 2 - sub_envs = [ - make_multi_agent("CartPole-v1")({"num_agents": 2}) for _ in range(num_envs) - ] - env = 
MultiAgentEnvWrapper(None, sub_envs, 2) - return env - - def test_check_env_not_correct_type_error(self): - env = RandomEnv() - with pytest.raises(ValueError, match="The passed env is not"): - check_base_env(env) - - def test_check_env_reset_incorrect_error(self): - reset = MagicMock(return_value=5) - env = self._make_base_env() - env.try_reset = reset - with pytest.raises(ValueError, match=("MultiEnvDict. Instead, it is of type")): - check_env(env) - obs_with_bad_agent_ids = { - 2: np.array([np.inf, np.inf, np.inf, np.inf]), - 1: np.array([np.inf, np.inf, np.inf, np.inf]), - } - obs_with_bad_env_ids = {"bad_env_id": obs_with_bad_agent_ids} - reset = MagicMock(return_value=obs_with_bad_env_ids) - env.try_reset = reset - with pytest.raises(ValueError, match="has dict keys that don't correspond to"): - check_env(env) - reset = MagicMock(return_value={0: obs_with_bad_agent_ids}) - env.try_reset = reset - - with pytest.raises( - ValueError, - match="The element returned by " - "try_reset has agent_ids that are" - " not the names of the agents", - ): - check_env(env) - out_of_bounds_obs = { - 0: { - 0: np.array([np.inf, np.inf, np.inf, np.inf]), - 1: np.array([np.inf, np.inf, np.inf, np.inf]), - } - } - env.try_reset = lambda *_: out_of_bounds_obs - with pytest.raises( - ValueError, match="The observation collected from try_reset" - ): - check_env(env) - - def test_check_space_contains_functions_errors(self): - def bad_contains_function(self, x): - raise ValueError("This is a bad contains function") - - env = self._make_base_env() - - env.observation_space_contains = bad_contains_function - with pytest.raises( - ValueError, match="Your observation_space_contains function has some" - ): - check_env(env) - - env = self._make_base_env() - env.action_space_contains = bad_contains_function - with pytest.raises( - ValueError, match="Your action_space_contains function has some error" - ): - check_env(env) - - def test_bad_sample_function(self): - env = self._make_base_env() - bad_action = {0: {0: 2, 1: 2}} - env.action_space_sample = lambda *_: bad_action - with pytest.raises( - ValueError, match="The action collected from action_space_sample" - ): - check_env(env) - - env = self._make_base_env() - bad_obs = { - 0: { - 0: np.array([np.inf, np.inf, np.inf, np.inf]), - 1: np.array([np.inf, np.inf, np.inf, np.inf]), - } - } - env.observation_space_sample = lambda *_: bad_obs - with pytest.raises( - ValueError, - match="The observation collected from observation_space_sample", - ): - check_env(env) - - def test_check_env_step_incorrect_error(self): - good_reward = {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}} - good_terminated = {0: {0: False, 1: False}, 1: {0: False, 1: False}} - good_info = {0: {0: {}, 1: {}}, 1: {0: {}, 1: {}}} - - env = self._make_base_env() - bad_multi_env_dict_obs = {0: 1, 1: {0: np.zeros(4)}} - poll = MagicMock( - return_value=( - bad_multi_env_dict_obs, - good_reward, - good_terminated, - good_info, - {}, - ) - ) - env.poll = poll - with pytest.raises( - ValueError, - match="The element returned by step, " - "next_obs contains values that are not" - " MultiAgentDicts", - ): - check_env(env) - - bad_reward = {0: {0: "not_reward", 1: 1}} - good_obs = env.observation_space_sample() - poll = MagicMock( - return_value=(good_obs, bad_reward, good_terminated, good_info, {}) - ) - env.poll = poll - with pytest.raises( - ValueError, match="Your step function must return rewards that are" - ): - check_env(env) - bad_terminated = {0: {0: "not_terminated", 1: False}} - poll = MagicMock( - 
return_value=(good_obs, good_reward, bad_terminated, good_info, {}) - ) - env.poll = poll - with pytest.raises( - ValueError, - match="Your step function must return `terminateds` that are boolean.", - ): - check_env(env) - bad_info = {0: {0: "not_info", 1: {}}} - poll = MagicMock( - return_value=(good_obs, good_reward, good_terminated, bad_info, {}) - ) - env.poll = poll - with pytest.raises( - ValueError, - match="Your step function must return infos that are a dict.", - ): - check_env(env) - - def test_check_correct_env(self): - env = self._make_base_env() - check_env(env) - env = gym.make("CartPole-v1") - env = convert_to_base_env(env) - check_env(env) - - -if __name__ == "__main__": - pytest.main() From 52fd2a709d4d58dffac34c7a4516c05c86242c9d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 22 Apr 2024 12:43:17 +0200 Subject: [PATCH 02/15] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/advanced_api.py | 2 +- doc/source/rllib/doc_code/getting_started.py | 2 +- doc/source/rllib/doc_code/new_api_stack.py | 6 +- doc/source/rllib/doc_code/rllib_in_60s.py | 2 +- .../rllib/doc_code/rllib_on_ray_readme.py | 2 +- doc/source/rllib/doc_code/training.py | 2 +- doc/source/rllib/rllib-advanced-api.rst | 4 +- doc/source/rllib/rllib-connector.rst | 2 +- rllib/README.rst | 6 +- rllib/algorithms/algorithm.py | 22 +- rllib/algorithms/algorithm_config.py | 375 +++++++++--------- rllib/algorithms/appo/appo.py | 4 +- rllib/algorithms/appo/tests/test_appo.py | 18 +- .../appo/tests/test_appo_learner.py | 12 +- .../appo/tests/test_appo_off_policyness.py | 2 +- rllib/algorithms/bc/tests/test_bc.py | 4 +- rllib/algorithms/cql/cql.py | 2 +- rllib/algorithms/cql/tests/test_cql.py | 4 +- rllib/algorithms/dqn/dqn.py | 45 +-- rllib/algorithms/dqn/tests/test_dqn.py | 10 +- rllib/algorithms/dreamerv3/README.md | 2 +- rllib/algorithms/dreamerv3/dreamerv3.py | 6 +- .../algorithms/dreamerv3/utils/env_runner.py | 8 +- rllib/algorithms/impala/impala.py | 18 +- rllib/algorithms/impala/tests/test_impala.py | 4 +- .../impala/tests/test_impala_learner.py | 4 +- .../tests/test_impala_off_policyness.py | 4 +- rllib/algorithms/marwil/tests/test_marwil.py | 10 +- rllib/algorithms/ppo/ppo.py | 4 +- rllib/algorithms/ppo/tests/test_ppo.py | 22 +- .../algorithms/ppo/tests/test_ppo_learner.py | 14 +- .../ppo/tests/test_ppo_with_env_runner.py | 4 +- .../ppo/tests/test_ppo_with_rl_module.py | 12 +- rllib/algorithms/ppo/tests/test_repro_ppo.py | 4 +- rllib/algorithms/sac/rnnsac.py | 2 +- rllib/algorithms/sac/sac.py | 10 +- rllib/algorithms/sac/tests/test_rnnsac.py | 2 +- rllib/algorithms/sac/tests/test_sac.py | 6 +- rllib/algorithms/tests/test_algorithm.py | 14 +- .../algorithms/tests/test_algorithm_config.py | 20 +- .../tests/test_callbacks_old_stack.py | 10 +- .../tests/test_callbacks_on_algorithm.py | 2 +- .../tests/test_callbacks_on_env_runner.py | 6 +- rllib/algorithms/tests/test_memory_leaks.py | 4 +- .../algorithms/tests/test_worker_failures.py | 56 +-- .../run_ppo_with_inference_bm.py | 6 +- rllib/connectors/tests/test_agent.py | 2 +- rllib/env/multi_agent_env_runner.py | 2 +- rllib/env/policy_server_input.py | 8 +- rllib/env/single_agent_env_runner.py | 2 +- rllib/env/tests/test_multi_agent_env.py | 38 +- .../env/tests/test_multi_agent_env_runner.py | 2 +- .../env/tests/test_single_agent_env_runner.py | 4 +- rllib/env/wrappers/model_vector_env.py | 4 +- rllib/evaluate.py | 4 +- rllib/evaluation/rollout_worker.py | 12 +- rllib/evaluation/tests/test_env_runner_v2.py | 48 +-- 
.../evaluation/tests/test_envs_that_crash.py | 22 +- rllib/evaluation/tests/test_episode.py | 8 +- rllib/evaluation/tests/test_episode_v2.py | 6 +- rllib/evaluation/tests/test_rollout_worker.py | 96 ++--- .../tests/test_trajectory_view_api.py | 16 +- rllib/evaluation/tests/test_worker_set.py | 6 +- rllib/evaluation/worker_set.py | 2 +- rllib/examples/_docs/rllib_on_rllib_readme.py | 2 +- .../_old_api_stack/complex_struct_space.py | 2 +- .../connectors/prepare_checkpoint.py | 4 +- .../self_play_with_policy_checkpoint.py | 2 +- .../remote_base_env_with_custom_api.py | 2 +- ...e_envs_with_inference_done_on_main_node.py | 2 +- ...raining_step_on_and_off_policy_combined.py | 2 +- rllib/examples/cartpole_lstm.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- rllib/examples/centralized_critic.py | 2 +- rllib/examples/centralized_critic_2.py | 4 +- rllib/examples/checkpoints/onnx_tf.py | 2 +- rllib/examples/checkpoints/onnx_torch.py | 2 +- rllib/examples/connectors/frame_stacking.py | 4 +- .../examples/connectors/mean_std_filtering.py | 6 +- .../connectors/nested_action_spaces.py | 2 +- .../connectors/nested_observation_spaces.py | 2 +- .../connectors/prev_actions_prev_rewards.py | 2 +- .../curriculum/curriculum_learning.py | 2 +- .../examples/custom_metrics_and_callbacks.py | 2 +- .../examples/custom_model_loss_and_metrics.py | 2 +- .../custom_recurrent_rnn_tokenizer.py | 2 +- .../debugging/deterministic_training.py | 2 +- .../envs/env_rendering_and_recording.py | 4 +- .../envs/external_envs/cartpole_server.py | 2 +- .../envs/external_envs/unity3d_server.py | 2 +- rllib/examples/envs/greyscale_env.py | 2 +- rllib/examples/envs/unity3d_env_local.py | 4 +- .../examples/evaluation/custom_evaluation.py | 2 +- .../evaluation_parallel_to_training.py | 6 +- rllib/examples/gpus/fractional_gpus.py | 2 +- .../hierarchical/hierarchical_training.py | 4 +- .../multi_agent/multi_agent_cartpole.py | 2 +- .../multi_agent/multi_agent_pendulum.py | 2 +- ...ock_paper_scissors_heuristic_vs_learned.py | 2 +- .../rock_paper_scissors_learned_vs_learned.py | 2 +- .../self_play_league_based_with_open_spiel.py | 2 +- .../multi_agent/self_play_with_open_spiel.py | 2 +- rllib/examples/multi_agent/two_algorithms.py | 4 +- .../two_step_game_with_grouped_agents.py | 2 +- rllib/examples/offline_rl/custom_input_api.py | 2 +- rllib/examples/offline_rl/offline_rl.py | 4 +- rllib/examples/ray_tune/custom_experiment.py | 4 +- rllib/examples/ray_tune/custom_logger.py | 2 +- .../ray_tune/custom_progress_reporter.py | 2 +- rllib/examples/replay_buffer_api.py | 2 +- .../rl_modules/classes/mobilenet_rlm.py | 2 +- rllib/execution/rollout_ops.py | 2 +- rllib/models/tests/test_attention_nets.py | 2 +- rllib/models/tests/test_lstms.py | 4 +- rllib/models/tests/test_models.py | 2 +- rllib/models/tests/test_preprocessors.py | 4 +- rllib/offline/estimators/tests/test_ope.py | 16 +- rllib/offline/estimators/tests/utils.py | 2 +- rllib/offline/tests/test_dataset_reader.py | 2 +- .../tests/test_compute_log_likelihoods.py | 4 +- .../tests/test_policy_checkpoint_restore.py | 8 +- .../backward_compat/test_backward_compat.py | 4 +- .../test_algorithm_checkpoint_restore.py | 8 +- .../tests/test_algorithm_rl_module_restore.py | 4 +- ..._algorithm_save_load_checkpoint_learner.py | 4 +- rllib/tests/test_custom_resource.py | 2 +- rllib/tests/test_dependency_tf.py | 2 +- rllib/tests/test_dependency_torch.py | 2 +- rllib/tests/test_gpus.py | 4 +- rllib/tests/test_io.py | 2 +- rllib/tests/test_local.py | 2 +- rllib/tests/test_lstm.py | 4 +- 
rllib/tests/test_model_imports.py | 2 +- rllib/tests/test_nested_observation_spaces.py | 8 +- rllib/tests/test_node_failure.py | 6 +- rllib/tests/test_pettingzoo_env.py | 6 +- rllib/tests/test_placement_groups.py | 8 +- rllib/tests/test_rllib_train_and_evaluate.py | 2 +- rllib/tests/test_supported_multi_agent.py | 4 +- rllib/tests/test_supported_spaces.py | 4 +- rllib/tests/test_timesteps.py | 2 +- .../appo/cartpole-appo-fake-gpus.yaml | 2 +- .../appo/cartpole-appo-separate-losses.py | 4 +- ...artpole-appo-w-rl-modules-and-learner.yaml | 2 +- rllib/tuned_examples/appo/cartpole-appo.yaml | 2 +- ...hing-and-stalling-recreate-workers-appo.py | 8 +- ...cartpole-crashing-recreate-workers-appo.py | 8 +- .../appo/frozenlake-appo-vtrace.yaml | 2 +- .../tuned_examples/appo/halfcheetah-appo.yaml | 2 +- .../appo/memory-leak-test-appo.yaml | 2 +- ...hing-and-stalling-recreate-workers-appo.py | 6 +- ...cartpole-crashing-recreate-workers-appo.py | 6 +- ...ulti-agent-cartpole-w-100-policies-appo.py | 12 +- .../appo/multi_agent_cartpole_appo.py | 6 +- .../pong-appo-w-rl-modules-and-learner.yaml | 2 +- rllib/tuned_examples/appo/pong-appo.yaml | 2 +- .../appo/stateless_cartpole_appo.py | 2 +- rllib/tuned_examples/bc/cartpole-bc.yaml | 2 +- .../compact-regression-test.yaml | 10 +- rllib/tuned_examples/cql/pendulum-cql.yaml | 2 +- .../tuned_examples/dqn/benchmark_dqn_atari.py | 4 +- ...benchmark_dqn_atari_rllib_preprocessing.py | 4 +- .../dqn/cartpole_dqn_envrunner.py | 4 +- rllib/tuned_examples/dreamerv3/atari_100k.py | 4 +- rllib/tuned_examples/dreamerv3/atari_200M.py | 4 +- .../dreamerv3/dm_control_suite_vision.py | 2 +- rllib/tuned_examples/dreamerv3/flappy_bird.py | 4 +- .../dreamerv3/gymnasium_robotics.py | 4 +- rllib/tuned_examples/dreamerv3/highway_env.py | 4 +- .../impala/atari-impala-large.yaml | 2 +- .../impala/atari-impala-multi-gpu.yaml | 2 +- rllib/tuned_examples/impala/atari-impala.yaml | 2 +- .../impala/cartpole-impala-separate-losses.py | 6 +- .../impala/memory-leak-test-impala.yaml | 2 +- .../impala/multi_agent_cartpole_impala.py | 6 +- .../impala/pong-impala-fast.yaml | 2 +- .../impala/pong-impala-vectorized.yaml | 2 +- rllib/tuned_examples/impala/pong-impala.yaml | 2 +- .../marwil/cartpole-marwil.yaml | 2 +- rllib/tuned_examples/ppo/atari-ppo.yaml | 2 +- .../ppo/benchmark_ppo_mujoco.py | 4 +- .../ppo/benchmark_ppo_mujoco_pb2.py | 4 +- .../ppo/cartpole_ppo_envrunner.py | 6 +- .../ppo/cartpole_truncated_ppo.py | 4 +- rllib/tuned_examples/ppo/halfcheetah-ppo.yaml | 2 +- .../ppo/memory-leak-test-ppo.yaml | 2 +- .../ppo/memory_leak_test_ppo_new_stack.py | 6 +- .../ppo/multi_agent_pendulum_ppo_envrunner.py | 6 +- rllib/tuned_examples/ppo/pendulum-ppo.yaml | 2 +- .../ppo/pendulum-transformed-actions-ppo.yaml | 2 +- .../ppo/pendulum_ppo_envrunner.py | 8 +- rllib/tuned_examples/ppo/pong-ppo.yaml | 2 +- .../tuned_examples/ppo/recomm-sys001-ppo.yaml | 2 +- .../ppo/repeatafterme-ppo-lstm.yaml | 2 +- .../sac/benchmark_sac_mujoco.py | 4 +- .../sac/benchmark_sac_mujoco_pb2.py | 4 +- .../sac/pendulum_sac_envrunner.py | 4 +- .../utils/exploration/tests/test_curiosity.py | 32 +- .../exploration/tests/test_explorations.py | 12 +- rllib/utils/test_utils.py | 8 +- rllib/utils/tests/test_errors.py | 2 +- 201 files changed, 780 insertions(+), 792 deletions(-) diff --git a/doc/source/rllib/doc_code/advanced_api.py b/doc/source/rllib/doc_code/advanced_api.py index 438945f24cfc..bc76de100ca9 100644 --- a/doc/source/rllib/doc_code/advanced_api.py +++ b/doc/source/rllib/doc_code/advanced_api.py @@ -29,7 +29,7 @@ def 
get(self): # __rllib-adv_api_explore_begin__ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -config = AlgorithmConfig().exploration( +config = AlgorithmConfig().env_runners( exploration_config={ # Special `type` key provides class information "type": "StochasticSampling", diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py index 73595f32e096..e5d742d946c5 100644 --- a/doc/source/rllib/doc_code/getting_started.py +++ b/doc/source/rllib/doc_code/getting_started.py @@ -7,7 +7,7 @@ algo = ( PPOConfig() - .rollouts(num_rollout_workers=1) + .env_runners(num_env_runners=1) .resources(num_gpus=0) .environment(env="CartPole-v1") .build() diff --git a/doc/source/rllib/doc_code/new_api_stack.py b/doc/source/rllib/doc_code/new_api_stack.py index c8260bff1209..597922bb48df 100644 --- a/doc/source/rllib/doc_code/new_api_stack.py +++ b/doc/source/rllib/doc_code/new_api_stack.py @@ -16,7 +16,7 @@ # Note that this step will be fully automated in the next release. # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and # `MultiAgentEnvRunner` for multi-agent cases. - .rollouts(env_runner_cls=SingleAgentEnvRunner) + .env_runners(env_runner_cls=SingleAgentEnvRunner) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -61,7 +61,7 @@ # Note that this step will be fully automated in the next release. # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and # `MultiAgentEnvRunner` for multi-agent cases. - .rollouts(env_runner_cls=MultiAgentEnvRunner) + .env_runners(env_runner_cls=MultiAgentEnvRunner) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -108,7 +108,7 @@ # to utilize all of the new API stack's classes, you also have to specify the # EnvRunner (replaces RolloutWorker) to use. # Note that this step will be fully automated in the next release. - .rollouts(env_runner_cls=SingleAgentEnvRunner) + .env_runners(env_runner_cls=SingleAgentEnvRunner) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and diff --git a/doc/source/rllib/doc_code/rllib_in_60s.py b/doc/source/rllib/doc_code/rllib_in_60s.py index 3f6173d3029c..3770abd66914 100644 --- a/doc/source/rllib/doc_code/rllib_in_60s.py +++ b/doc/source/rllib/doc_code/rllib_in_60s.py @@ -6,7 +6,7 @@ config = ( # 1. Configure the algorithm, PPOConfig() .environment("Taxi-v3") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .framework("torch") .training(model={"fcnet_hiddens": [64, 64]}) .evaluation(evaluation_num_workers=1) diff --git a/doc/source/rllib/doc_code/rllib_on_ray_readme.py b/doc/source/rllib/doc_code/rllib_on_ray_readme.py index 8aace3c7e4b0..0e429b79aec4 100644 --- a/doc/source/rllib/doc_code/rllib_on_ray_readme.py +++ b/doc/source/rllib/doc_code/rllib_on_ray_readme.py @@ -63,7 +63,7 @@ def step(self, action): env_config={"corridor_length": 28}, ) # Parallelize environment rollouts. 
- .rollouts(num_rollout_workers=3) + .env_runners(num_env_runners=3) ) # Construct the actual (PPO) algorithm object from the config. algo = config.build() diff --git a/doc/source/rllib/doc_code/training.py b/doc/source/rllib/doc_code/training.py index c9a87e767e04..f9f64c81fa93 100644 --- a/doc/source/rllib/doc_code/training.py +++ b/doc/source/rllib/doc_code/training.py @@ -35,7 +35,7 @@ DQNConfig() .environment("CartPole-v1") .framework("tf2") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .build() ) # diff --git a/doc/source/rllib/rllib-advanced-api.rst b/doc/source/rllib/rllib-advanced-api.rst index 945dbd176c0d..a1b0d7eb038f 100644 --- a/doc/source/rllib/rllib-advanced-api.rst +++ b/doc/source/rllib/rllib-advanced-api.rst @@ -211,13 +211,13 @@ actions from distributions (stochastically or deterministically). The setup can be done using built-in Exploration classes (see `this package `__), which are specified (and further configured) inside -``AlgorithmConfig().exploration(..)``. +``AlgorithmConfig().env_runners(..)``. Besides using one of the available classes, one can sub-class any of these built-ins, add custom behavior to it, and use that new class in the config instead. Every policy has-an Exploration object, which is created from the AlgorithmConfig’s -``.exploration(exploration_config=...)`` method, which specifies the class to use through the +``.env_runners(exploration_config=...)`` method, which specifies the class to use through the special “type” key, as well as constructor arguments through all other keys, e.g.: diff --git a/doc/source/rllib/rllib-connector.rst b/doc/source/rllib/rllib-connector.rst index d62b512ed0a0..7cecdc5feef0 100644 --- a/doc/source/rllib/rllib-connector.rst +++ b/doc/source/rllib/rllib-connector.rst @@ -21,7 +21,7 @@ By consolidating these transformations under the framework of connectors, users - Allow policies to be adapted to work with different versions of an environment. - Run inference with RLlib policies without worrying about the exact trajectory view requirements or state inputs. -Connectors can be enabled by setting the ``enable_connectors`` parameter to ``True`` with ``AlgorithmConfig.rollouts()`` API. +Connectors can be enabled by setting the ``enable_connectors`` parameter to ``True`` with ``AlgorithmConfig.env_runners()`` API. Key Concepts ------------ diff --git a/rllib/README.rst b/rllib/README.rst index b5dbaf66ac84..6f67c0f2b877 100644 --- a/rllib/README.rst +++ b/rllib/README.rst @@ -161,8 +161,8 @@ Quick First Experiment "parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1, )) }, ) - # Parallelize environment rollouts. - .rollouts(num_rollout_workers=3) + # Parallelize environment sampling. + .env_runners(num_env_runners=3) ) # Use the config's `build()` method to construct a PPO object. algo = config.build() @@ -235,7 +235,7 @@ allow you to set the ``num_workers`` config parameter, such that your workloads on 100s of CPUs/nodes thus parallelizing and speeding up learning. **Vectorized (batched) and remote (parallel) environments**: RLlib auto-vectorizes -your ``gym.Envs`` via the ``num_envs_per_worker`` config. Environment workers can +your ``gym.Envs`` via the ``num_envs_per_env_runner`` config. Environment workers can then batch and thus significantly speedup the action computing forward pass. 
On top of that, RLlib offers the ``remote_worker_envs`` config to create `single environments (within a vectorized one) as ray Actors `_, diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 0ed94e8ba373..a1b41e7cf63f 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -622,7 +622,7 @@ def setup(self, config: AlgorithmConfig) -> None: validate_env=self.validate_env, default_policy_class=self.get_default_policy_class(self.config), config=self.config, - num_workers=self.config.num_rollout_workers, + num_workers=self.config.num_env_runners, local_worker=True, logdir=self.logdir, ) @@ -644,7 +644,7 @@ def setup(self, config: AlgorithmConfig) -> None: ) # Create a separate evaluation worker set for evaluation. - # If evaluation_num_workers=0, use the evaluation set's local + # If evaluation_num_env_runners=0, use the evaluation set's local # worker for evaluation, otherwise, use its remote workers # (parallelized evaluation). self.evaluation_workers: WorkerSet = WorkerSet( @@ -652,7 +652,7 @@ def setup(self, config: AlgorithmConfig) -> None: validate_env=None, default_policy_class=self.get_default_policy_class(self.config), config=self.evaluation_config, - num_workers=self.config.evaluation_num_workers, + num_workers=self.config.evaluation_num_env_runners, logdir=self.logdir, ) @@ -1243,7 +1243,7 @@ def _env_runner_remote(worker, num, round, iter): "Calling `sample()` on your remote evaluation worker(s) " "resulted in all workers crashing! Make sure a) your environment is not" " too unstable, b) you have enough evaluation workers " - "(`config.evaluation(evaluation_num_workers=...)`) to cover for " + "(`config.evaluation(evaluation_num_env_runners=...)`) to cover for " "occasional losses, and c) you use the `config.fault_tolerance(" "recreate_failed_workers=True)` setting." ) @@ -1280,7 +1280,7 @@ def _evaluate_with_fixed_duration(self): # How many episodes/timesteps do we need to run? unit = self.config.evaluation_duration_unit eval_cfg = self.evaluation_config - num_workers = self.config.evaluation_num_workers + num_workers = self.config.evaluation_num_env_runners force_reset = self.config.evaluation_force_reset_envs_before_iteration time_out = self.config.evaluation_sample_timeout_s @@ -1417,7 +1417,7 @@ def _env_runner_remote(worker, num, round, iter): "Calling `sample()` on your remote evaluation worker(s) " "resulted in all workers crashing! Make sure a) your environment is not" " too unstable, b) you have enough evaluation workers " - "(`config.evaluation(evaluation_num_workers=...)`) to cover for " + "(`config.evaluation(evaluation_num_env_runners=...)`) to cover for " "occasional losses, and c) you use the `config.fault_tolerance(" "recreate_failed_workers=True)` setting." 
            )
@@ -2437,7 +2437,7 @@ def default_resource_request(
                     "GPU": cf.num_gpus_per_worker,
                     **cf.custom_resources_per_worker,
                 }
-                for _ in range(cf.num_rollout_workers)
+                for _ in range(cf.num_env_runners)
             ]

             # resources for remote evaluation env samplers or datasets (if any)
@@ -2450,7 +2450,7 @@ def default_resource_request(
                         "GPU": eval_cf.num_gpus_per_worker,
                         **eval_cf.custom_resources_per_worker,
                     }
-                    for _ in range(eval_cf.evaluation_num_workers)
+                    for _ in range(eval_cf.evaluation_num_env_runners)
                 ]
             else:
                 # resources for offline dataset readers during evaluation
@@ -2615,7 +2615,7 @@ def resource_help(cls, config: Union[AlgorithmConfig, AlgorithmConfigDict]) -> s
            "\n\nYou can adjust the resource requests of RLlib Algorithms by calling "
            "`AlgorithmConfig.resources("
            "num_gpus=.., num_cpus_per_worker=.., num_gpus_per_worker=.., ..)` or "
-           "`AgorithmConfig.rollouts(num_rollout_workers=..)`. See "
+           "`AlgorithmConfig.env_runners(num_env_runners=..)`. See "
            "the `ray.rllib.algorithms.algorithm_config.AlgorithmConfig` classes "
            "(each Algorithm has its own subclass of this class) for more info.\n\n"
            f"The config of this Algorithm is: {config}"
@@ -3278,7 +3278,7 @@ def _run_offline_evaluation(self):
         """
         assert len(self.workers.local_worker().policy_map) == 1

-        parallelism = self.evaluation_config.evaluation_num_workers or 1
+        parallelism = self.evaluation_config.evaluation_num_env_runners or 1
         offline_eval_results = {"off_policy_estimator": {}}
         for evaluator_name, offline_evaluator in self.reward_estimators.items():
             offline_eval_results["off_policy_estimator"][
@@ -3304,7 +3304,7 @@ def _should_create_evaluation_rollout_workers(cls, eval_config: "AlgorithmConfig
             and not eval_config.ope_split_batch_by_episode
         )
         return not run_offline_evaluation and (
-            eval_config.evaluation_num_workers > 0 or eval_config.evaluation_interval
+            eval_config.evaluation_num_env_runners > 0 or eval_config.evaluation_interval
         )

     def _compile_iteration_results(
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index af2bd3a23c2e..d23eb0f0f056 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -129,7 +129,7 @@ class AlgorithmConfig(_Config):
         config = (PPOConfig().training(gamma=0.9, lr=0.01)
             .environment(env="CartPole-v1")
             .resources(num_gpus=0)
-            .rollouts(num_rollout_workers=0)
+            .env_runners(num_env_runners=0)
             .callbacks(MemoryTrackingCallbacks)
         )
         # A config object can be used to construct the respective Algorithm.
@@ -222,7 +222,7 @@ def overrides(cls, **kwargs):
             config = (
                 PPOConfig()
                 .evaluation(
-                    evaluation_num_workers=1,
+                    evaluation_num_env_runners=1,
                     evaluation_interval=1,
                     evaluation_config=AlgorithmConfig.overrides(explore=False),
                 )
@@ -249,7 +249,14 @@ def overrides(cls, **kwargs):

         return config_overrides

-    def __init__(self, algo_class=None):
+    def __init__(self, algo_class: Optional[type] = None):
+        """Initializes an AlgorithmConfig instance.
+
+        Args:
+            algo_class: An optional Algorithm class that this config class belongs to.
+                Used (if provided) to build a respective Algorithm instance from this
+                config.
+        """
         # Define all settings and their default values.
# Define the default RLlib Algorithm class that this AlgorithmConfig will be @@ -317,22 +324,20 @@ def __init__(self, algo_class=None): self.env_config = {} self.observation_space = None self.action_space = None - self.env_task_fn = None - self.render_env = False self.clip_rewards = None self.normalize_actions = True self.clip_actions = False - self.action_mask_key = "action_mask" - # Whether this env is an atari env (for atari-specific preprocessing). - # If not specified, we will try to auto-detect this. self._is_atari = None + # Deprecated settings: + self.env_task_fn = None + self.render_env = False + self.action_mask_key = "action_mask" - # TODO (sven): Rename this method into `AlgorithmConfig.sampling()` - # `self.rollouts()` + # `self.env_runners()` self.env_runner_cls = None - # TODO (sven): Rename into `num_env_runner_workers`. - self.num_rollout_workers = 0 - self.num_envs_per_worker = 1 + self.num_env_runners = 0 + self.num_envs_per_env_runner = 1 + self.validate_env_runners_after_construction = True self.sample_timeout_s = 60.0 self.create_env_on_local_worker = False self._env_to_module_connector = None @@ -346,16 +351,8 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 200 # TODO (sven): Rename into `sample_mode`. self.batch_mode = "truncate_episodes" - # TODO (sven): Rename into `validate_env_runner_workers_after_construction`. - self.validate_workers_after_construction = True self.compress_observations = False - # TODO (sven): Rename into `env_runner_perf_stats_ema_coef`. - self.sampler_perf_stats_ema_coef = None - # TODO (sven): Deprecate this setting. Connectors should always be enabled - # on new stack. - self.enable_connectors = True - - # TODO (sven): Deprecate together with old API stack. + # @OldAPIStack self.remote_worker_envs = False self.remote_env_batch_wait_ms = 0 self.enable_tf1_exec_eagerly = False @@ -364,7 +361,12 @@ def __init__(self, algo_class=None): self.observation_filter = "NoFilter" self.update_worker_filter_stats = True self.use_worker_filter_stats = True - # TODO (sven): End: deprecate. + self.enable_connectors = True + self.sampler_perf_stats_ema_coef = None + # Deprecated args. + self.num_rollout_workers = DEPRECATED_VALUE + self.num_envs_per_worker = DEPRECATED_VALUE + self.validate_workers_after_construction = DEPRECATED_VALUE # `self.training()` self.gamma = 0.99 @@ -444,7 +446,7 @@ def __init__(self, algo_class=None): self.evaluation_config = None self.off_policy_estimation_methods = {} self.ope_split_batch_by_episode = True - self.evaluation_num_workers = 0 + self.evaluation_num_env_runners = 0 self.custom_evaluation_function = None self.always_attach_evaluation_results = True # TODO: Set this flag still in the config or - much better - in the @@ -522,6 +524,7 @@ def __init__(self, algo_class=None): # TODO: Remove, once all deprecation_warning calls upon using these keys # have been removed. 
# === Deprecated keys === + self.evaluation_num_workers = DEPRECATED_VALUE self.simple_optimizer = DEPRECATED_VALUE self.monitor = DEPRECATED_VALUE self.evaluation_num_episodes = DEPRECATED_VALUE @@ -602,7 +605,7 @@ def to_dict(self) -> AlgorithmConfigDict: config["custom_eval_function"] = config.pop("custom_evaluation_function", None) config["framework"] = config.pop("framework_str", None) config["num_cpus_for_driver"] = config.pop("num_cpus_for_local_worker", 1) - config["num_workers"] = config.pop("num_rollout_workers", 0) + config["num_workers"] = config.pop("num_env_runners", config.pop("num_rollout_workers", 0)) # Simplify: Remove all deprecated keys that have as value `DEPRECATED_VALUE`. # These would be useless in the returned dict anyways. @@ -709,7 +712,7 @@ def update_from_dict( continue if isinstance(value, dict) and "type" in value: value["type"] = deserialize_type(value["type"]) - self.exploration(exploration_config=value) + self.env_runners(exploration_config=value) elif key == "model": # Resolve possible classpath. if isinstance(value, dict) and value.get("custom_model"): @@ -724,7 +727,7 @@ def update_from_dict( elif key == "sample_collector": # Resolve possible classpath. value = deserialize_type(value) - self.rollouts(sample_collector=value) + self.env_runners(sample_collector=value) # Set the property named `key` to `value`. else: setattr(self, key, value) @@ -880,7 +883,7 @@ def build_env_to_module_connector(self, env): # Unsupported return value. else: raise ValueError( - "`AlgorithmConfig.rollouts(env_to_module_connector=..)` must return" + "`AlgorithmConfig.env_runners(env_to_module_connector=..)` must return" " a ConnectorV2 object or a list thereof (to be added to a " f"pipeline)! Your function returned {val_}." ) @@ -946,7 +949,7 @@ def build_module_to_env_connector(self, env): # Unsupported return value. else: raise ValueError( - "`AlgorithmConfig.rollouts(module_to_env_connector=..)` must return" + "`AlgorithmConfig.env_runners(module_to_env_connector=..)` must return" " a ConnectorV2 object or a list thereof (to be added to a " f"pipeline)! Your function returned {val_}." ) @@ -1425,8 +1428,8 @@ def environment( gymnasium env, a PyBullet env, or a fully qualified classpath to an Env class, e.g. "ray.rllib.examples.envs.classes.random_env.RandomEnv". env_config: Arguments dict passed to the env creator as an EnvContext - object (which is a dict plus the properties: num_rollout_workers, - worker_index, vector_index, and remote). + object (which is a dict plus the properties: `num_env_runners`, + `worker_index`, `vector_index`, and `remote`). observation_space: The observation space for the Policies of this Algorithm. action_space: The action space for the Policies of this Algorithm. env_task_fn: A callable taking the last train results, the base env and the @@ -1434,7 +1437,7 @@ def environment( The env must be a `TaskSettableEnv` sub-class for this to work. See `examples/curriculum_learning.py` for an example. render_env: If True, try to render the environment on the local worker or on - worker 1 (if num_rollout_workers > 0). For vectorized envs, this usually + worker 1 (if num_env_runners > 0). For vectorized envs, this usually means that only the first sub-environment will be rendered. 
In order for this to work, your env will have to implement the `render()` method which either: @@ -1499,16 +1502,14 @@ def environment( return self - def rollouts( + def env_runners( self, *, env_runner_cls: Optional[type] = NotProvided, - num_rollout_workers: Optional[int] = NotProvided, - num_envs_per_worker: Optional[int] = NotProvided, + num_env_runners: Optional[int] = NotProvided, + num_envs_per_env_runner: Optional[int] = NotProvided, + validate_env_runners_after_construction: Optional[bool] = NotProvided, sample_timeout_s: Optional[float] = NotProvided, - create_env_on_local_worker: Optional[bool] = NotProvided, - sample_collector: Optional[Type[SampleCollector]] = NotProvided, - enable_connectors: Optional[bool] = NotProvided, env_to_module_connector: Optional[ Callable[[EnvType], Union["ConnectorV2", List["ConnectorV2"]]] ] = NotProvided, @@ -1520,16 +1521,27 @@ def rollouts( episode_lookback_horizon: Optional[int] = NotProvided, use_worker_filter_stats: Optional[bool] = NotProvided, update_worker_filter_stats: Optional[bool] = NotProvided, + compress_observations: Optional[bool] = NotProvided, rollout_fragment_length: Optional[Union[int, str]] = NotProvided, batch_mode: Optional[str] = NotProvided, - remote_worker_envs: Optional[bool] = NotProvided, - remote_env_batch_wait_ms: Optional[float] = NotProvided, - validate_workers_after_construction: Optional[bool] = NotProvided, - preprocessor_pref: Optional[str] = NotProvided, - observation_filter: Optional[str] = NotProvided, - compress_observations: Optional[bool] = NotProvided, - enable_tf1_exec_eagerly: Optional[bool] = NotProvided, - sampler_perf_stats_ema_coef: Optional[float] = NotProvided, + explore: Optional[bool] = NotProvided, + + # @OldAPIStack settings. + exploration_config: Optional[dict] = NotProvided, # @OldAPIStack + create_env_on_local_worker: Optional[bool] = NotProvided, # @OldAPIStack + sample_collector: Optional[Type[SampleCollector]] = NotProvided, # @OldAPIStack + enable_connectors: Optional[bool] = NotProvided, # @OldAPIStack + remote_worker_envs: Optional[bool] = NotProvided, # @OldAPIStack + remote_env_batch_wait_ms: Optional[float] = NotProvided, # @OldAPIStack + preprocessor_pref: Optional[str] = NotProvided, # @OldAPIStack + observation_filter: Optional[str] = NotProvided, # @OldAPIStack + enable_tf1_exec_eagerly: Optional[bool] = NotProvided, # @OldAPIStack + sampler_perf_stats_ema_coef: Optional[float] = NotProvided, # @OldAPIStack + + # Deprecated args. + num_rollout_workers=DEPRECATED_VALUE, + num_envs_per_worker=DEPRECATED_VALUE, + validate_workers_after_construction=DEPRECATED_VALUE, ignore_worker_failures=DEPRECATED_VALUE, recreate_failed_workers=DEPRECATED_VALUE, restart_failed_sub_environments=DEPRECATED_VALUE, @@ -1543,13 +1555,13 @@ def rollouts( Args: env_runner_cls: The EnvRunner class to use for environment rollouts (data collection). - num_rollout_workers: Number of rollout worker actors to create for - parallel sampling. Setting this to 0 will force rollouts to be done in - the local worker (driver process or the Algorithm's actor when using - Tune). - num_envs_per_worker: Number of environments to evaluate vector-wise per - worker. This enables model inference batching, which can improve - performance for inference bottlenecked workloads. + num_env_runners: Number of EnvRunner actors to create for parallel sampling. + Setting this to 0 will force sampling to be done in the local + EnvRunner (main process or the Algorithm's actor when using Tune). 
+ num_envs_per_env_runner: Number of environments to step through + (vector-wise) per EnvRunner. This enables batching when computing + actions through RLModule inference, which can improve performance + for inference-bottlenecked workloads. sample_timeout_s: The timeout in seconds for calling `sample()` on remote EnvRunner workers. Results (episode list) from workers that take longer than this time are discarded. Only used by algorithms that sample @@ -1560,7 +1572,7 @@ def rollouts( be used to collect and retrieve environment-, model-, and sampler data. Override the SampleCollector base class to implement your own collection/buffering/retrieval logic. - create_env_on_local_worker: When `num_rollout_workers` > 0, the driver + create_env_on_local_worker: When `num_env_runners` > 0, the driver (local_worker; worker-idx=0) does not need an environment. This is because it doesn't have to sample (done by remote_workers; worker_indices > 0) nor evaluate (done by evaluation workers; @@ -1641,6 +1653,10 @@ def rollouts( Note that when `num_envs_per_worker > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. + explore: Default exploration behavior, iff `explore=None` is passed into + compute_action(s). Set to False for no exploration behavior (e.g., + for evaluation). + exploration_config: A dict specifying the Exploration object's config. remote_worker_envs: If using num_envs_per_worker > 1, whether to create those new envs in remote processes instead of in the same worker. This adds overheads, but can make sense if your envs can take much @@ -1650,8 +1666,8 @@ def rollouts( polling environments. 0 (continue when at least one env is ready) is a reasonable default, but optimal value could be obtained by measuring your environment step / reset and model inference perf. - validate_workers_after_construction: Whether to validate that each created - remote worker is healthy after its construction process. + validate_env_runners_after_construction: Whether to validate that each + created remote EnvRunner is healthy after its construction process. preprocessor_pref: Whether to use "rllib" or "deepmind" preprocessors by default. Set to None for using no preprocessor. In this case, the model will have to handle possibly complex observations from the @@ -1672,16 +1688,42 @@ def rollouts( Returns: This updated AlgorithmConfig object. 
""" + if num_rollout_workers != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.env_runners(num_rollout_workers)", + new="AlgorithmConfig.env_runners(num_env_runners)", + error=False, + ) + self.num_env_runners = num_rollout_workers + if num_envs_per_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.env_runners(num_envs_per_worker)", + new="AlgorithmConfig.env_runners(num_envs_per_env_runner)", + error=False, + ) + self.num_envs_per_env_runner = num_envs_per_worker + if validate_workers_after_construction != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.env_runners(validate_workers_after_construction)", + new="AlgorithmConfig.env_runners(validate_env_runners_after_" + "construction)", + error=False, + ) + self.validate_env_runners_after_construction = ( + validate_workers_after_construction + ) + if env_runner_cls is not NotProvided: self.env_runner_cls = env_runner_cls - if num_rollout_workers is not NotProvided: - self.num_rollout_workers = num_rollout_workers - if num_envs_per_worker is not NotProvided: - if num_envs_per_worker <= 0: + if num_env_runners is not NotProvided: + self.num_env_runners = num_env_runners + if num_envs_per_env_runner is not NotProvided: + if num_envs_per_env_runner <= 0: raise ValueError( - f"`num_envs_per_worker` ({num_envs_per_worker}) must be larger 0!" + f"`num_envs_per_env_runner` ({num_envs_per_env_runner}) must be " + "larger 0!" ) - self.num_envs_per_worker = num_envs_per_worker + self.num_envs_per_env_runner = num_envs_per_env_runner if sample_timeout_s is not NotProvided: self.sample_timeout_s = sample_timeout_s if sample_collector is not NotProvided: @@ -1725,13 +1767,26 @@ def rollouts( "complete_episodes]!" ) self.batch_mode = batch_mode + if explore is not NotProvided: + self.explore = explore + if exploration_config is not NotProvided: + # Override entire `exploration_config` if `type` key changes. + # Update, if `type` key remains the same or is not specified. + new_exploration_config = deep_update( + {"exploration_config": self.exploration_config}, + {"exploration_config": exploration_config}, + False, + ["exploration_config"], + ["exploration_config"], + ) + self.exploration_config = new_exploration_config["exploration_config"] if remote_worker_envs is not NotProvided: self.remote_worker_envs = remote_worker_envs if remote_env_batch_wait_ms is not NotProvided: self.remote_env_batch_wait_ms = remote_env_batch_wait_ms - if validate_workers_after_construction is not NotProvided: - self.validate_workers_after_construction = ( - validate_workers_after_construction + if validate_env_runners_after_construction is not NotProvided: + self.validate_env_runners_after_construction = ( + validate_env_runners_after_construction ) if preprocessor_pref is not NotProvided: self.preprocessor_pref = preprocessor_pref @@ -1749,63 +1804,54 @@ def rollouts( # Deprecated settings. 
if synchronize_filter != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.rollouts(synchronize_filter=..)", - new="AlgorithmConfig.rollouts(update_worker_filter_stats=..)", - error=False, + old="AlgorithmConfig.env_runners(synchronize_filter=..)", + new="AlgorithmConfig.env_runners(update_worker_filter_stats=..)", + error=True, ) - self.update_worker_filter_stats = synchronize_filter if ignore_worker_failures != DEPRECATED_VALUE: deprecation_warning( old="ignore_worker_failures is deprecated, and will soon be a no-op", - error=False, + error=True, ) - self.ignore_worker_failures = ignore_worker_failures if recreate_failed_workers != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.rollouts(recreate_failed_workers=..)", + old="AlgorithmConfig.env_runners(recreate_failed_workers=..)", new="AlgorithmConfig.fault_tolerance(recreate_failed_workers=..)", - error=False, + error=True, ) - self.recreate_failed_workers = recreate_failed_workers if restart_failed_sub_environments != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.rollouts(restart_failed_sub_environments=..)", + old="AlgorithmConfig.env_runners(restart_failed_sub_environments=..)", new=( "AlgorithmConfig.fault_tolerance(" "restart_failed_sub_environments=..)" ), - error=False, + error=True, ) - self.restart_failed_sub_environments = restart_failed_sub_environments if num_consecutive_worker_failures_tolerance != DEPRECATED_VALUE: deprecation_warning( old=( - "AlgorithmConfig.rollouts(" + "AlgorithmConfig.env_runners(" "num_consecutive_worker_failures_tolerance=..)" ), new=( "AlgorithmConfig.fault_tolerance(" "num_consecutive_worker_failures_tolerance=..)" ), - error=False, - ) - self.num_consecutive_worker_failures_tolerance = ( - num_consecutive_worker_failures_tolerance + error=True, ) if worker_health_probe_timeout_s != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.rollouts(worker_health_probe_timeout_s=..)", + old="AlgorithmConfig.env_runners(worker_health_probe_timeout_s=..)", new="AlgorithmConfig.fault_tolerance(worker_health_probe_timeout_s=..)", - error=False, + error=True, ) - self.worker_health_probe_timeout_s = worker_health_probe_timeout_s if worker_restore_timeout_s != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.rollouts(worker_restore_timeout_s=..)", + old="AlgorithmConfig.env_runners(worker_restore_timeout_s=..)", new="AlgorithmConfig.fault_tolerance(worker_restore_timeout_s=..)", - error=False, + error=True, ) - self.worker_restore_timeout_s = worker_restore_timeout_s return self @@ -1826,8 +1872,6 @@ def training( Callable[["RLModule"], Union["ConnectorV2", List["ConnectorV2"]]] ] = NotProvided, add_default_connectors_to_learner_pipeline: Optional[bool] = NotProvided, - # Deprecated arg. - _enable_learner_api: Optional[bool] = NotProvided, ) -> "AlgorithmConfig": """Sets the training related configuration. @@ -1957,12 +2001,6 @@ def training( self.max_requests_in_flight_per_sampler_worker = ( max_requests_in_flight_per_sampler_worker ) - if _enable_learner_api is not NotProvided: - deprecation_warning( - old="AlgorithmConfig.training(_enable_learner_api=True|False)", - new="AlgorithmConfig.experimental(_enable_new_api_stack=True|False)", - error=True, - ) if learner_class is not NotProvided: self._learner_class = learner_class if learner_connector is not NotProvided: @@ -1999,41 +2037,6 @@ def callbacks(self, callbacks_class) -> "AlgorithmConfig": return self - # TODO (sven): Deprecate this method. Move `explore` setting into `rollouts()`. 
- # `exploration_config` should no longer be used on the new API stack. - def exploration( - self, - *, - explore: Optional[bool] = NotProvided, - exploration_config: Optional[dict] = NotProvided, - ) -> "AlgorithmConfig": - """Sets the config's exploration settings. - - Args: - explore: Default exploration behavior, iff `explore=None` is passed into - compute_action(s). Set to False for no exploration behavior (e.g., - for evaluation). - exploration_config: A dict specifying the Exploration object's config. - - Returns: - This updated AlgorithmConfig object. - """ - if explore is not NotProvided: - self.explore = explore - if exploration_config is not NotProvided: - # Override entire `exploration_config` if `type` key changes. - # Update, if `type` key remains the same or is not specified. - new_exploration_config = deep_update( - {"exploration_config": self.exploration_config}, - {"exploration_config": exploration_config}, - False, - ["exploration_config"], - ["exploration_config"], - ) - self.exploration_config = new_exploration_config["exploration_config"] - - return self - def evaluation( self, *, @@ -2048,13 +2051,12 @@ def evaluation( ] = NotProvided, off_policy_estimation_methods: Optional[Dict] = NotProvided, ope_split_batch_by_episode: Optional[bool] = NotProvided, - evaluation_num_workers: Optional[int] = NotProvided, + evaluation_num_env_runners: Optional[int] = NotProvided, custom_evaluation_function: Optional[Callable] = NotProvided, always_attach_evaluation_results: Optional[bool] = NotProvided, # Deprecated args. evaluation_num_episodes=DEPRECATED_VALUE, - enable_async_evaluation=DEPRECATED_VALUE, - custom_async_evaluation_function=DEPRECATED_VALUE, + evaluation_num_workers=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the config's evaluation settings. @@ -2066,8 +2068,8 @@ def evaluation( `evaluation_interval`. The unit for the duration can be set via `evaluation_duration_unit` to either "episodes" (default) or "timesteps". If using multiple evaluation workers (EnvRunners) in the - `evaluation_num_workers > 1` setting, the amount of episodes/timesteps - to run will be split amongst these. + `evaluation_num_env_runners > 1` setting, the amount of + episodes/timesteps to run will be split amongst these. A special value of "auto" can be used in case `evaluation_parallel_to_training=True`. This is the recommended way when trying to save as much time on evaluation as possible. The Algorithm @@ -2126,12 +2128,13 @@ def evaluation( case of bandits you should make this False to see improvements in ope evaluation speed. In case of bandits, it is ok to not split by episode, since each record is one timestep already. The default is True. - evaluation_num_workers: Number of parallel workers to use for evaluation. - Note that this is set to zero by default, which means evaluation will - be run in the algorithm process (only if evaluation_interval is not 0 or - None). If you increase this, it will increase the Ray resource usage of - the algorithm since evaluation workers are created separately from - rollout workers (used to sample data for training). + evaluation_num_env_runners: Number of parallel EnvRunners to use for + evaluation. Note that this is set to zero by default, which means + evaluation will be run in the algorithm process (only if + `evaluation_interval` is not 0 or None). If you increase this, it will + increase the Ray resource usage of the algorithm since evaluation + workers are created separately from those EnvRunners used to sample data + for training. 
custom_evaluation_function: Customize the evaluation method. This must be a function of signature (algo: Algorithm, eval_workers: WorkerSet) -> metrics: dict. See the Algorithm.evaluate() method to see the default @@ -2150,22 +2153,15 @@ def evaluation( old="AlgorithmConfig.evaluation(evaluation_num_episodes=..)", new="AlgorithmConfig.evaluation(evaluation_duration=.., " "evaluation_duration_unit='episodes')", - error=False, - ) - evaluation_duration = evaluation_num_episodes - elif enable_async_evaluation != DEPRECATED_VALUE: - deprecation_warning( - old="AlgorithmConfig.evaluation(enable_async_evaluation=...)", - new="AlgorithmConfig.evaluation(evaluation_parallel_to_training=...)", error=True, ) - elif custom_async_evaluation_function != DEPRECATED_VALUE: + if evaluation_num_workers != DEPRECATED_VALUE: deprecation_warning( - old="AlgorithmConfig.evaluation(custom_async_evaluation_function=...)", - new="AlgorithmConfig.evaluation(evaluation_parallel_to_training=True," - " custom_evaluation_function=...)", - error=True, + old="AlgorithmConfig.evaluation(evaluation_num_workers=..)", + new="AlgorithmConfig.evaluation(evaluation_num_env_runners=..)", + error=False, ) + self.evaluation_num_env_runners = evaluation_num_workers if evaluation_interval is not NotProvided: self.evaluation_interval = evaluation_interval @@ -2200,8 +2196,8 @@ def evaluation( ) if off_policy_estimation_methods is not NotProvided: self.off_policy_estimation_methods = off_policy_estimation_methods - if evaluation_num_workers is not NotProvided: - self.evaluation_num_workers = evaluation_num_workers + if evaluation_num_env_runners is not NotProvided: + self.evaluation_num_env_runners = evaluation_num_env_runners if custom_evaluation_function is not NotProvided: self.custom_evaluation_function = custom_evaluation_function if always_attach_evaluation_results is not NotProvided: @@ -2298,13 +2294,13 @@ def offline_data( raise ValueError( msg.format( "parallelism", - "config.evaluation(evaluation_num_workers=..)", + "config.evaluation(evaluation_num_env_runners=..)", ) ) else: raise ValueError( msg.format( - "parallelism", "config.rollouts(num_rollout_workers=..)" + "parallelism", "config.env_runners(num_env_runners=..)" ) ) self.input_config = input_config @@ -2759,7 +2755,6 @@ def fault_tolerance( return self - @ExperimentalAPI def rl_module( self, *, @@ -2961,7 +2956,7 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: Uses the simple formula: `rollout_fragment_length` = `total_train_batch_size` / - (`num_envs_per_worker` * `num_rollout_workers`) + (`num_envs_per_worker` * `num_env_runners`) If result is a fraction AND `worker_index` is provided, will make those workers add additional timesteps, such that the overall batch size (across @@ -2983,12 +2978,12 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: # -> 512 / 40 -> 12.8 -> diff=32 (12 * 40 = 480) # -> worker 1: 13, workers 2: 12 rollout_fragment_length = self.total_train_batch_size / ( - self.num_envs_per_worker * (self.num_rollout_workers or 1) + self.num_envs_per_worker * (self.num_env_runners or 1) ) if int(rollout_fragment_length) != rollout_fragment_length: diff = self.total_train_batch_size - int( rollout_fragment_length - ) * self.num_envs_per_worker * (self.num_rollout_workers or 1) + ) * self.num_envs_per_worker * (self.num_env_runners or 1) if ((worker_index - 1) * self.num_envs_per_worker) >= diff: return int(rollout_fragment_length) else: @@ -3058,7 +3053,7 @@ def get_evaluation_config_object( if 
self.evaluation_duration == "auto" else int( math.ceil( - self.evaluation_duration / (self.evaluation_num_workers or 1) + self.evaluation_duration / (self.evaluation_num_env_runners or 1) ) ) ) @@ -3325,7 +3320,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: dependent on rollout_fragment_length (synchronous sampling, on-policy PG algos). If rollout_fragment_length != "auto", makes sure that the product of - `rollout_fragment_length` x `num_rollout_workers` x `num_envs_per_worker` + `rollout_fragment_length` x `num_env_runners` x `num_envs_per_worker` roughly (10%) matches the provided `train_batch_size`. Otherwise, errors with asking the user to set rollout_fragment_length to `auto` or to a matching value. @@ -3343,7 +3338,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: and self.total_train_batch_size > 0 ): min_batch_size = ( - max(self.num_rollout_workers, 1) + max(self.num_env_runners, 1) * self.num_envs_per_worker * self.rollout_fragment_length ) @@ -3356,14 +3351,14 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: 0.1 * self.total_train_batch_size ): suggested_rollout_fragment_length = self.total_train_batch_size // ( - self.num_envs_per_worker * (self.num_rollout_workers or 1) + self.num_envs_per_worker * (self.num_env_runners or 1) ) raise ValueError( "Your desired `total_train_batch_size` " f"({self.total_train_batch_size}={self.num_learner_workers} " f"learners x {self.train_batch_size_per_learner}) " "or a value 10% off of that cannot be achieved with your other " - f"settings (num_rollout_workers={self.num_rollout_workers}; " + f"settings (num_env_runners={self.num_env_runners}; " f"num_envs_per_worker={self.num_envs_per_worker}; " f"rollout_fragment_length={self.rollout_fragment_length})! " "Try setting `rollout_fragment_length` to 'auto' OR to a value of " @@ -3884,25 +3879,25 @@ def _validate_evaluation_settings(self): "instead." ) - # If `evaluation_num_workers` > 0, warn if `evaluation_interval` is 0 or + # If `evaluation_num_env_runners` > 0, warn if `evaluation_interval` is 0 or # None. - if self.evaluation_num_workers > 0 and not self.evaluation_interval: + if self.evaluation_num_env_runners > 0 and not self.evaluation_interval: logger.warning( - f"You have specified {self.evaluation_num_workers} " + f"You have specified {self.evaluation_num_env_runners} " "evaluation workers, but your `evaluation_interval` is 0 or None! " "Therefore, evaluation will not occur automatically with each" " call to `Algorithm.train()`. Instead, you will have to call " "`Algorithm.evaluate()` manually in order to trigger an " "evaluation run." ) - # If `evaluation_num_workers=0` and + # If `evaluation_num_env_runners=0` and # `evaluation_parallel_to_training=True`, warn that you need # at least one remote eval worker for parallel training and # evaluation, and set `evaluation_parallel_to_training` to False. - if self.evaluation_num_workers == 0 and self.evaluation_parallel_to_training: + if self.evaluation_num_env_runners == 0 and self.evaluation_parallel_to_training: raise ValueError( "`evaluation_parallel_to_training` can only be done if " - "`evaluation_num_workers` > 0! Try setting " + "`evaluation_num_env_runners` > 0! Try setting " "`config.evaluation_parallel_to_training` to False." 
) @@ -3948,15 +3943,15 @@ def _validate_input_settings(self): self.input_config["num_cpus_per_read_task"] = self.num_cpus_per_worker if self.in_evaluation: # If using dataset for evaluation, the parallelism gets set to - # evaluation_num_workers for backward compatibility and num_cpus gets - # set to num_cpus_per_worker from rollout worker. User only needs to - # set evaluation_num_workers. - self.input_config["parallelism"] = self.evaluation_num_workers or 1 + # evaluation_num_env_runners for backward compatibility and num_cpus + # gets set to num_cpus_per_worker from rollout worker. User only needs + # to set evaluation_num_env_runners. + self.input_config["parallelism"] = self.evaluation_num_env_runners or 1 else: # If using dataset for training, the parallelism and num_cpus gets set # based on rollout worker parameters. This is for backwards - # compatibility for now. User only needs to set num_rollout_workers. - self.input_config["parallelism"] = self.num_rollout_workers or 1 + # compatibility for now. User only needs to set num_env_runners. + self.input_config["parallelism"] = self.num_env_runners or 1 def _validate_new_api_stack_settings(self): """Checks, whether settings related to the new API stack make sense.""" @@ -3997,7 +3992,7 @@ def _validate_new_api_stack_settings(self): raise ValueError( "The new API stack (RLModule and Learner APIs) only works with " "connectors! Please enable connectors via " - "`config.rollouts(enable_connectors=True)`." + "`config.env_runners(enable_connectors=True)`." ) # LR-schedule checking. @@ -4238,7 +4233,7 @@ def _translate_special_keys(key: str, warn_deprecated: bool = True) -> str: elif key == "num_cpus_for_driver": key = "num_cpus_for_local_worker" elif key == "num_workers": - key = "num_rollout_workers" + key = "num_env_runners" # Deprecated keys. if warn_deprecated: @@ -4347,19 +4342,13 @@ def _resolve_tf_settings(self, _tf1, _tfv): "speed as with static-graph mode." ) - @property - @Deprecated( - old="AlgorithmConfig.multiagent['[some key]']", - new="AlgorithmConfig.[some key]", - error=True, - ) - def multiagent(self): - pass + @Deprecated(new="AlgorithmConfig.env_runners(..)", error=False) + def rollouts(self, *args, **kwargs): + return self.env_runners(*args, **kwargs) - @property - @Deprecated(new="AlgorithmConfig.rollouts(num_rollout_workers=..)", error=True) - def num_workers(self): - pass + @Deprecated(new="AlgorithmConfig.env_runners(..)", error=False) + def exploration(self, *args, **kwargs): + return self.env_runners(*args, **kwargs) class TorchCompileWhatToCompile(str, Enum): diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 98d3a3c22400..4f5caa1259f9 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -46,7 +46,7 @@ class APPOConfig(ImpalaConfig): from ray.rllib.algorithms.appo import APPOConfig config = APPOConfig().training(lr=0.01, grad_clip=30.0, train_batch_size=50) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) config = config.environment("CartPole-v1") # Build an Algorithm object from the config and run 1 training iteration. @@ -99,7 +99,7 @@ def __init__(self, algo_class=None): self.kl_target = 0.01 # Override some of ImpalaConfig's default values with APPO-specific values. 
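# Editor's note, not part of the patch: with the `rollouts()` / `exploration()`
# shims added above, old-style user code keeps working; both methods now emit a
# deprecation warning and simply forward their kwargs to `env_runners()`. A rough
# sketch (PPO and the concrete values are assumptions for illustration):
from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().environment("CartPole-v1")
# Old-style calls -> forwarded to `config.env_runners(...)`, with a warning each.
config.rollouts(num_env_runners=2, rollout_fragment_length=50)
config.exploration(explore=False)
# Equivalent new-style call:
# config.env_runners(num_env_runners=2, rollout_fragment_length=50, explore=False)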
- self.num_rollout_workers = 2 + self.num_env_runners = 2 self.rollout_fragment_length = 50 self.train_batch_size = 500 self.min_time_s_per_iteration = 10 diff --git a/rllib/algorithms/appo/tests/test_appo.py b/rllib/algorithms/appo/tests/test_appo.py index 5bd84f22efe6..2a7b211a5396 100644 --- a/rllib/algorithms/appo/tests/test_appo.py +++ b/rllib/algorithms/appo/tests/test_appo.py @@ -22,7 +22,7 @@ def tearDownClass(cls): def test_appo_compilation(self): """Test whether APPO can be built with both frameworks.""" - config = appo.APPOConfig().rollouts(num_rollout_workers=1) + config = appo.APPOConfig().env_runners(num_env_runners=1) num_iterations = 2 for _ in framework_iterator(config): @@ -38,7 +38,7 @@ def test_appo_compilation(self): def test_appo_compilation_use_kl_loss(self): """Test whether APPO can be built with kl_loss enabled.""" config = ( - appo.APPOConfig().rollouts(num_rollout_workers=1).training(use_kl_loss=True) + appo.APPOConfig().env_runners(num_env_runners=1).training(use_kl_loss=True) ) num_iterations = 2 @@ -56,7 +56,7 @@ def test_appo_two_optimizers_two_lrs(self): # config["_tf_policy_handles_more_than_one_loss"] = True config = ( appo.APPOConfig() - .rollouts(num_rollout_workers=1) + .env_runners(num_env_runners=1) .training( _separate_vf_optimizer=True, _lr_vf=0.002, @@ -84,8 +84,8 @@ def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. config = ( appo.APPOConfig() - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, batch_mode="truncate_episodes", rollout_fragment_length=10, ) @@ -139,8 +139,8 @@ def _step_n_times(algo, n: int): def test_appo_learning_rate_schedule(self): config = ( appo.APPOConfig() - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, batch_mode="truncate_episodes", rollout_fragment_length=10, ) @@ -186,8 +186,8 @@ def _step_n_times(algo, n: int): def test_appo_model_variables(self): config = ( appo.APPOConfig() - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, batch_mode="truncate_episodes", rollout_fragment_length=10, ) diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index 9c4bf60daab6..c6f1b4d96307 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -58,8 +58,8 @@ def test_appo_loss(self): appo.APPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=frag_length, ) .resources(num_gpus=0) @@ -73,7 +73,7 @@ def test_appo_loss(self): ) ) # We have to set exploration_config here manually because setting it through - # config.exploration() only deep-updates it + # config.env_runners() only deep-updates it config.exploration_config = {} for fw in framework_iterator(config, frameworks=("torch", "tf2")): @@ -108,9 +108,10 @@ def test_kl_coeff_changes(self): .environment("CartPole-v1") # Asynchronous Algo, make sure we have some results after 1 iteration. 
.reporting(min_time_s_per_iteration=10) - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=frag_length, + exploration_config={}, ) .resources(num_gpus=0) .training( @@ -123,7 +124,6 @@ def test_kl_coeff_changes(self): use_kl_loss=True, kl_coeff=initial_kl_coeff, ) - .exploration(exploration_config={}) ) for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() diff --git a/rllib/algorithms/appo/tests/test_appo_off_policyness.py b/rllib/algorithms/appo/tests/test_appo_off_policyness.py index a52d097d0315..b5df78ba809d 100644 --- a/rllib/algorithms/appo/tests/test_appo_off_policyness.py +++ b/rllib/algorithms/appo/tests/test_appo_off_policyness.py @@ -26,7 +26,7 @@ def test_appo_off_policyness(self): appo.APPOConfig() .environment("CartPole-v1") .resources(num_gpus=1) - .rollouts(num_rollout_workers=4) + .env_runners(num_env_runners=4) ) num_iterations = 3 diff --git a/rllib/algorithms/bc/tests/test_bc.py b/rllib/algorithms/bc/tests/test_bc.py index 99db57432066..89acd8102caa 100644 --- a/rllib/algorithms/bc/tests/test_bc.py +++ b/rllib/algorithms/bc/tests/test_bc.py @@ -24,7 +24,7 @@ def test_bc_compilation_and_learning_from_offline_file(self): """Test whether BC can be built with all frameworks. And learns from a historic-data file (while being evaluated on an - actual env using evaluation_num_workers > 0). + actual env using evaluation_num_env_runners > 0). """ rllib_dir = Path(__file__).parents[3] print("rllib_dir={}".format(rllib_dir)) @@ -36,7 +36,7 @@ def test_bc_compilation_and_learning_from_offline_file(self): bc.BCConfig() .evaluation( evaluation_interval=3, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_duration=5, evaluation_parallel_to_training=True, evaluation_config=bc.BCConfig.overrides(input_="sampler"), diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py index 980889bb5340..ea719ad0d65c 100644 --- a/rllib/algorithms/cql/cql.py +++ b/rllib/algorithms/cql/cql.py @@ -49,7 +49,7 @@ class CQLConfig(SACConfig): from ray.rllib.algorithms.cql import CQLConfig config = CQLConfig().training(gamma=0.9, lr=0.01) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=4) + config = config.env_runners(num_env_runners=4) print(config.to_dict()) # Build a Algorithm object from the config and run 1 training iteration. 
algo = config.build(env="CartPole-v1") diff --git a/rllib/algorithms/cql/tests/test_cql.py b/rllib/algorithms/cql/tests/test_cql.py index a2e7ab78088c..fa01a83bcc3f 100644 --- a/rllib/algorithms/cql/tests/test_cql.py +++ b/rllib/algorithms/cql/tests/test_cql.py @@ -64,9 +64,9 @@ def test_cql_compilation(self): evaluation_duration=10, evaluation_config=cql.CQLConfig.overrides(input_="sampler"), evaluation_parallel_to_training=False, - evaluation_num_workers=2, + evaluation_num_env_runners=2, ) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .reporting(min_time_s_per_iteration=0) ) num_iterations = 4 diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index cc0976ce50b3..19908cfc04db 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -73,7 +73,7 @@ class DQNConfig(AlgorithmConfig): config = config.training(replay_buffer_config=replay_config) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) config = config.environment("CartPole-v1") algo = DQN(config=config) algo.train() @@ -107,20 +107,9 @@ def __init__(self, algo_class=None): super().__init__(algo_class=algo_class or DQN) # Overrides of AlgorithmConfig defaults - # `rollouts()` + # `env_runners()` # Set to `self.n_step`, if 'auto'. self.rollout_fragment_length = "auto" - - # `training()` - self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by - # global_norm, no matter the value of `grad_clip_by`. - self.grad_clip_by = "global_norm" - self.lr = 5e-4 - self.train_batch_size = 32 - - # `exploration()` self.exploration_config = { "type": "EpsilonGreedy", "initial_epsilon": 1.0, @@ -130,8 +119,18 @@ def __init__(self, algo_class=None): # New stack uses `epsilon` as either a constant value or a scheduler # defined like this. # TODO (simon): Ensure that users can understand how to provide epsilon. + # (sven): Should we add this to `self.env_runners(epsilon=..)`? self.epsilon = [(0, 1.0), (10000, 0.05)] + # `training()` + self.grad_clip = 40.0 + # Note: Only when using _enable_new_api_stack=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + self.lr = 5e-4 + self.train_batch_size = 32 + # `evaluation()` self.evaluation(evaluation_config=AlgorithmConfig.overrides(explore=False)) @@ -301,7 +300,7 @@ def training( collecting samples from the env). If None, uses "natural" values of: `train_batch_size` / (`rollout_fragment_length` x `num_workers` x - `num_envs_per_worker`). + `num_envs_per_env_runner`). If not None, will make sure that the ratio between timesteps inserted into and sampled from the buffer matches the given values. Example: @@ -309,7 +308,7 @@ def training( train_batch_size=250 rollout_fragment_length=1 num_workers=1 (or 0) - num_envs_per_worker=1 + num_envs_per_env_runner=1 -> natural value = 250 / 1 = 250.0 -> will make sure that replay+train op will be executed 4x asoften as rollout+insert op (4 * 250 = 1000). @@ -399,7 +398,7 @@ def validate(self) -> None: if self.batch_mode != "complete_episodes": raise ValueError( "ParameterNoise Exploration requires `batch_mode` to be " - "'complete_episodes'. Try setting `config.rollouts(" + "'complete_episodes'. 
Try setting `config.env_runners(" "batch_mode='complete_episodes')`." ) @@ -418,7 +417,7 @@ def validate(self) -> None: raise ValueError( f"Your `rollout_fragment_length` ({self.rollout_fragment_length}) is " f"smaller than `n_step` ({self.n_step})! " - f"Try setting config.rollouts(rollout_fragment_length={self.n_step})." + f"Try setting config.env_runners(rollout_fragment_length={self.n_step})." ) # TODO (simon): Find a clean solution to deal with @@ -430,7 +429,7 @@ def validate(self) -> None: if self.batch_mode != "complete_episodes": raise ValueError( "ParameterNoise Exploration requires `batch_mode` to be " - "'complete_episodes'. Try setting `config.rollouts(" + "'complete_episodes'. Try setting `config.env_runners(" "batch_mode='complete_episodes')`." ) if self.noisy: @@ -515,7 +514,7 @@ def get_default_learner_class(self) -> Union[Type["Learner"], str]: def calculate_rr_weights(config: AlgorithmConfig) -> List[float]: """Calculate the round robin weights for the rollout and train steps""" - if not config["training_intensity"]: + if not config.training_intensity: return [1, 1] # Calculate the "native ratio" as: @@ -523,17 +522,17 @@ def calculate_rr_weights(config: AlgorithmConfig) -> List[float]: # This is to set freshly rollout-collected data in relation to # the data we pull from the replay buffer (which also contains old # samples). - native_ratio = config["train_batch_size"] / ( + native_ratio = config.train_batch_size / ( config.get_rollout_fragment_length() - * config["num_envs_per_worker"] + * config.num_envs_per_env_runner # Add one to workers because the local # worker usually collects experiences as well, and we avoid division by zero. - * max(config["num_workers"] + 1, 1) + * max(config.num_env_runners + 1, 1) ) # Training intensity is specified in terms of # (steps_replayed / steps_sampled), so adjust for the native ratio. - sample_and_train_weight = config["training_intensity"] / native_ratio + sample_and_train_weight = config.training_intensity / native_ratio if sample_and_train_weight < 1: return [int(np.round(1 / sample_and_train_weight)), 1] else: diff --git a/rllib/algorithms/dqn/tests/test_dqn.py b/rllib/algorithms/dqn/tests/test_dqn.py index 83688f3c38af..b2472d24e03f 100644 --- a/rllib/algorithms/dqn/tests/test_dqn.py +++ b/rllib/algorithms/dqn/tests/test_dqn.py @@ -27,7 +27,7 @@ def test_dqn_compilation(self): config = ( dqn.dqn.DQNConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .training(num_steps_sampled_before_learning_starts=0) ) @@ -66,7 +66,7 @@ def test_dqn_compilation_integer_rewards(self): config = ( dqn.dqn.DQNConfig() .environment("Taxi-v3") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .training(num_steps_sampled_before_learning_starts=0) ) @@ -102,7 +102,7 @@ def test_dqn_exploration_and_soft_q_config(self): config = ( dqn.dqn.DQNConfig() .environment("FrozenLake-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .environment(env_config={"is_slippery": False, "map_name": "4x4"}) ).training(num_steps_sampled_before_learning_starts=0) @@ -126,7 +126,7 @@ def test_dqn_exploration_and_soft_q_config(self): # Low softmax temperature. Behaves like argmax # (but no epsilon exploration). - config.exploration( + config.env_runners( exploration_config={"type": "SoftQ", "temperature": 0.000001} ) algo = config.build() @@ -157,7 +157,7 @@ def test_dqn_exploration_and_soft_q_config(self): algo.stop() # With Random exploration. 
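# Editor's note, not part of the patch: a worked instance of the round-robin
# weight computation in `calculate_rr_weights()` above, plugging in the numbers
# from the `training_intensity` docstring (training_intensity=1000,
# train_batch_size=250, rollout_fragment_length=1, num_envs_per_env_runner=1,
# num_env_runners=0):
#
#   native_ratio            = 250 / (1 * 1 * max(0 + 1, 1)) = 250.0
#   sample_and_train_weight = 1000 / 250.0                  = 4.0   (>= 1)
#   -> calculate_rr_weights(config) == [1, 4]
#
# i.e. the train step is scheduled roughly four times per sampling step, matching
# the "4x as often" ratio described in the docstring example.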
- config.exploration(exploration_config={"type": "Random"}, explore=True) + config.env_runners(exploration_config={"type": "Random"}, explore=True) algo = config.build() actions = [] for _ in range(300): diff --git a/rllib/algorithms/dreamerv3/README.md b/rllib/algorithms/dreamerv3/README.md index 4771fa7f826e..149c4f898635 100644 --- a/rllib/algorithms/dreamerv3/README.md +++ b/rllib/algorithms/dreamerv3/README.md @@ -155,7 +155,7 @@ adjustments should be made on top of the default config. Use the `DreamerV3Config.training(batch_size_B=..)` API for this. For example, for 2 GPUs, use a batch size of `B=32`. - Multiply the number of environments you sample from in parallel by the number of GPUs you are using. - Use the `DreamerV3Config.rollouts(num_envs_per_worker=..)` for this. + Use the `DreamerV3Config.env_runners(num_envs_per_env_runner=..)` for this. For example, for 4 GPUs and a default environment count of 8 (the single-GPU default for this setting depends on the benchmark you are running), use 32 parallel environments instead. diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py index cfe0508b6c2c..437d35df82a1 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -142,7 +142,7 @@ def __init__(self, algo_class=None): # Do not use! Set `batch_size_B` and `batch_length_T` instead. self.train_batch_size = None self.env_runner_cls = DreamerV3EnvRunner - self.num_rollout_workers = 0 + self.num_env_runners = 0 self.rollout_fragment_length = 1 # Since we are using a gymnasium-based EnvRunner, we can utilitze its # vectorization capabilities w/o suffering performance losses (as we would @@ -446,9 +446,9 @@ def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: @property def share_module_between_env_runner_and_learner(self) -> bool: # If we only have one local Learner (num_learner_workers=0) and only - # one local EnvRunner (num_rollout_workers=0), share the RLModule + # one local EnvRunner (num_env_runners=0), share the RLModule # between these two to avoid having to sync weights, ever. - return self.num_learner_workers == 0 and self.num_rollout_workers == 0 + return self.num_learner_workers == 0 and self.num_env_runners == 0 @property @override(AlgorithmConfig) diff --git a/rllib/algorithms/dreamerv3/utils/env_runner.py b/rllib/algorithms/dreamerv3/utils/env_runner.py index 9ccb19d26353..45907986276c 100644 --- a/rllib/algorithms/dreamerv3/utils/env_runner.py +++ b/rllib/algorithms/dreamerv3/utils/env_runner.py @@ -82,7 +82,7 @@ def __init__( "GymV26Environment-v0", env_id=self.config.env, wrappers=wrappers, - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, asynchronous=self.config.remote_worker_envs, make_kwargs=dict( self.config.env_config, **{"render_mode": "rgb_array"} @@ -104,7 +104,7 @@ def __init__( self.env = gym.vector.make( "dmc_env-v0", wrappers=[ActionClip], - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, asynchronous=self.config.remote_worker_envs, **dict(self.config.env_config), ) @@ -127,11 +127,11 @@ def __init__( # Create the vectorized gymnasium env. 
self.env = gym.vector.make( "dreamerv3-custom-env-v0", - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, asynchronous=False, # self.config.remote_worker_envs, ) self.num_envs = self.env.num_envs - assert self.num_envs == self.config.num_envs_per_worker + assert self.num_envs == self.config.num_envs_per_env_runner # Create our RLModule to compute actions with. policy_dict, _ = self.config.get_multi_agent_setup(env=self.env) diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 232d5f355a46..41962010cf2e 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -71,7 +71,7 @@ class ImpalaConfig(AlgorithmConfig): config = ImpalaConfig() config = config.training(lr=0.0003, train_batch_size=512) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) # Build a Algorithm object from the config and run 1 training iteration. algo = config.build(env="CartPole-v1") algo.train() @@ -89,7 +89,7 @@ class ImpalaConfig(AlgorithmConfig): lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0 ) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) # Set the config object's env. config = config.environment(env="CartPole-v1") # Run with tune. @@ -151,7 +151,7 @@ def __init__(self, algo_class=None): self.rollout_fragment_length = 50 self.train_batch_size = 500 self._minibatch_size = "auto" - self.num_rollout_workers = 2 + self.num_env_runners = 2 self.num_gpus = 1 self.lr = 0.0005 self.min_time_s_per_iteration = 10 @@ -402,12 +402,12 @@ def validate(self) -> None: raise ValueError("`entropy_coeff` must be >= 0.0") # Check whether worker to aggregation-worker ratio makes sense. - if self.num_aggregation_workers > self.num_rollout_workers: + if self.num_aggregation_workers > self.num_env_runners: raise ValueError( "`num_aggregation_workers` must be smaller than or equal " - "`num_rollout_workers`! Aggregation makes no sense otherwise." + "`num_env_runners`! Aggregation makes no sense otherwise." ) - elif self.num_aggregation_workers > self.num_rollout_workers / 2: + elif self.num_aggregation_workers > self.num_env_runners / 2: logger.warning( "`num_aggregation_workers` should be significantly smaller " "than `num_workers`! 
Try setting it to 0.5*`num_workers` or " @@ -803,7 +803,7 @@ def default_resource_request( "GPU": cf.num_gpus_per_worker, **cf.custom_resources_per_worker, } - for _ in range(cf.num_rollout_workers) + for _ in range(cf.num_env_runners) ] + ( [ @@ -815,7 +815,7 @@ def default_resource_request( "GPU": eval_config.num_gpus_per_worker, **eval_config.custom_resources_per_worker, } - for _ in range(cf.evaluation_num_workers) + for _ in range(cf.evaluation_num_env_runners) ] if cf.evaluation_interval else [] @@ -1150,7 +1150,7 @@ def update_workers_from_learner_group( self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] = 0 self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 weights = self.learner_group.get_weights(policy_ids) - if self.config.num_rollout_workers == 0: + if self.config.num_env_runners == 0: worker = self.workers.local_worker() worker.set_weights(weights) else: diff --git a/rllib/algorithms/impala/tests/test_impala.py b/rllib/algorithms/impala/tests/test_impala.py index dc79a3ef5ee0..cf7fe4d51eec 100644 --- a/rllib/algorithms/impala/tests/test_impala.py +++ b/rllib/algorithms/impala/tests/test_impala.py @@ -30,7 +30,7 @@ def test_impala_compilation(self): impala.ImpalaConfig() .environment("CartPole-v1") .resources(num_gpus=0) - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .training( model={ "lstm_use_prev_action": True, @@ -78,7 +78,7 @@ def test_impala_lr_schedule(self): ], train_batch_size=100, ) - .rollouts(num_envs_per_worker=2) + .env_runners(num_envs_per_env_runner=2) .environment(env="CartPole-v1") ) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 59159f7ff4f7..b4b90cb8305a 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -60,8 +60,8 @@ def test_impala_loss(self): ImpalaConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=frag_length, ) .resources(num_gpus=0) diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index 282001ff3c78..0cf8ec62875c 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -26,7 +26,7 @@ def test_impala_off_policyness(self): .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") .resources(num_gpus=0) - .rollouts(num_rollout_workers=4) + .env_runners(num_env_runners=4) ) num_iterations = 3 num_aggregation_workers_options = [0, 1] @@ -35,7 +35,7 @@ def test_impala_off_policyness(self): for _ in framework_iterator(config, frameworks=("tf2", "torch")): # We have to set exploration_config here manually because setting - # it through config.exploration() only deepupdates it + # it through config.env_runners() only deepupdates it config.exploration_config = {} config.num_aggregation_workers = num_aggregation_workers print("aggregation-workers={}".format(config.num_aggregation_workers)) diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py index ca57944c1577..70d06cfadf16 100644 --- a/rllib/algorithms/marwil/tests/test_marwil.py +++ b/rllib/algorithms/marwil/tests/test_marwil.py @@ -46,11 +46,11 @@ def test_marwil_compilation_and_learning_from_offline_file(self): config = ( marwil.MARWILConfig() - 
.rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .environment(env="CartPole-v1") .evaluation( evaluation_interval=3, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_duration=5, evaluation_parallel_to_training=True, evaluation_config=marwil.MARWILConfig.overrides(input_="sampler"), @@ -109,9 +109,9 @@ def test_marwil_cont_actions_from_offline_file(self): config = ( marwil.MARWILConfig() - .rollouts(num_rollout_workers=1) + .env_runners(num_env_runners=1) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=3, evaluation_duration=5, evaluation_parallel_to_training=True, @@ -148,7 +148,7 @@ def test_marwil_loss_function(self): config = ( marwil.MARWILConfig() - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .offline_data(input_=[data_file]) ) # Learn from offline data. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 8d5157c7bc4b..0d4741625b72 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -63,7 +63,7 @@ class PPOConfig(AlgorithmConfig): config = config.training(gamma=0.9, lr=0.01, kl_coeff=0.3, train_batch_size=128) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) # Build a Algorithm object from the config and run 1 training iteration. algo = config.build(env="CartPole-v1") @@ -129,7 +129,7 @@ def __init__(self, algo_class=None): self.grad_clip = None # Override some of AlgorithmConfig's default values with PPO-specific values. - self.num_rollout_workers = 2 + self.num_env_runners = 2 self.model["vf_share_layers"] = False # __sphinx_doc_end__ # fmt: on diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 12cbadc51e33..8541995302e0 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -146,8 +146,8 @@ def test_ppo_compilation_w_connectors(self): max_seq_len=20, ), ) - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, # Test with compression. compress_observations=True, enable_connectors=True, @@ -156,7 +156,7 @@ def test_ppo_compilation_w_connectors(self): .evaluation( evaluation_duration=2, evaluation_duration_unit="episodes", - evaluation_num_workers=1, + evaluation_num_env_runners=1, ) ) # For checking lr-schedule correctness. @@ -221,8 +221,8 @@ def test_ppo_compilation_and_schedule_mixins(self): max_seq_len=20, ), ) - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, # Test with compression. compress_observations=True, ) @@ -279,9 +279,9 @@ def test_ppo_exploration_setup(self): .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, - ).rollouts( + ).env_runners( # Run locally. - num_rollout_workers=0, + num_env_runners=0, ) ) obs = np.array(0) @@ -329,8 +329,8 @@ def test_ppo_free_log_std(self): # Learner API stack. 
.experimental(_enable_new_api_stack=False) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, ) .training( gamma=0.99, @@ -394,8 +394,8 @@ def test_ppo_loss_function(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=False) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, ) .training( gamma=0.99, diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 3dd64bcd4d67..2d897caca974 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -58,8 +58,8 @@ def test_loss(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, ) .training( gamma=0.99, @@ -106,8 +106,8 @@ def test_save_load_state(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, ) .training( gamma=0.99, @@ -144,9 +144,10 @@ def test_kl_coeff_changes(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=50, + exploration_config={}, ) .training( gamma=0.99, @@ -157,7 +158,6 @@ def test_kl_coeff_changes(self): ), kl_coeff=initial_kl_coeff, ) - .exploration(exploration_config={}) .environment("multi_agent_cartpole") .multi_agent( policies={"p0", "p1"}, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index a7f04872da07..bc318db78bff 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -76,7 +76,7 @@ def test_ppo_compilation_and_schedule_mixins(self): ppo.PPOConfig() # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, ) @@ -92,7 +92,7 @@ def test_ppo_compilation_and_schedule_mixins(self): .callbacks(MyCallbacks) .evaluation( # Also test evaluation with remote workers. - evaluation_num_workers=2, + evaluation_num_env_runners=2, evaluation_duration=3, evaluation_duration_unit="episodes", evaluation_parallel_to_training=True, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 56f76f837e1e..724f64374ed2 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -88,8 +88,8 @@ def test_ppo_compilation_and_schedule_mixins(self): entropy_coeff=[[0, 0.1], [256, 0.0]], # 256=2x128, train_batch_size=128, ) - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, # Test with compression. # compress_observations=True, enable_connectors=True, @@ -142,9 +142,9 @@ def test_ppo_exploration_setup(self): "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, ) - .rollouts( + .env_runners( # Run locally. 
- num_rollout_workers=0, + num_env_runners=0, ) ) obs = np.array(0) @@ -183,8 +183,8 @@ def test_ppo_free_log_std_with_rl_modules(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=True) .environment("Pendulum-v1") - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, ) .rl_module( model_config_dict={ diff --git a/rllib/algorithms/ppo/tests/test_repro_ppo.py b/rllib/algorithms/ppo/tests/test_repro_ppo.py index 0cbbed585a04..7d0fdcfaef2f 100644 --- a/rllib/algorithms/ppo/tests/test_repro_ppo.py +++ b/rllib/algorithms/ppo/tests/test_repro_ppo.py @@ -28,7 +28,7 @@ def test_reproducibility_ppo_cartpole(self): configs = ( ppo.PPOConfig() .environment(env="DeterministicCartPole-v1", env_config={"seed": 42}) - .rollouts(rollout_fragment_length=8) + .env_runners(rollout_fragment_length=8) .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) ) check_reproducibilty( @@ -46,7 +46,7 @@ def test_reproducibility_ppo_pendulum(self): configs = ( ppo.PPOConfig() .environment(env="DeterministicPendulum-v1", env_config={"seed": 42}) - .rollouts(rollout_fragment_length=8) + .env_runners(rollout_fragment_length=8) .training(train_batch_size=64, sgd_minibatch_size=32, num_sgd_iter=2) ) check_reproducibilty( diff --git a/rllib/algorithms/sac/rnnsac.py b/rllib/algorithms/sac/rnnsac.py index 176e389f4aef..7936e45242a5 100644 --- a/rllib/algorithms/sac/rnnsac.py +++ b/rllib/algorithms/sac/rnnsac.py @@ -17,7 +17,7 @@ class RNNSACConfig(SACConfig): Example: >>> config = RNNSACConfig().training(gamma=0.9, lr=0.01)\ ... .resources(num_gpus=0)\ - ... .rollouts(num_rollout_workers=4) + ... .env_runners(num_env_runners=4) >>> print(config.to_dict()) # doctest: +SKIP >>> # Build a Algorithm object from the config and run 1 training iteration. >>> algo = config.build(env="CartPole-v1") diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index b38fee812f83..87fd6830f36e 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -46,7 +46,7 @@ class SACConfig(AlgorithmConfig): config = SACConfig().training(gamma=0.9, lr=0.01, train_batch_size=32) config = config.resources(num_gpus=0) - config = config.rollouts(num_rollout_workers=1) + config = config.env_runners(num_env_runners=1) # Build a Algorithm object from the config and run 1 training iteration. algo = config.build(env="CartPole-v1") @@ -102,7 +102,9 @@ def __init__(self, algo_class=None): self.grad_clip = None self.target_network_update_freq = 0 - # .exploration() + # .env_runners() + self.rollout_fragment_length = "auto" + self.compress_observations = False self.exploration_config = { # The Exploration class to use. In the simplest case, this is the name # (str) of any class present in the `rllib.utils.exploration` package. @@ -113,10 +115,6 @@ def __init__(self, algo_class=None): # Add constructor kwargs here (if any). 
} - # .rollout() - self.rollout_fragment_length = "auto" - self.compress_observations = False - # .training() self.train_batch_size = 256 # Number of timesteps to collect from rollout workers before we start diff --git a/rllib/algorithms/sac/tests/test_rnnsac.py b/rllib/algorithms/sac/tests/test_rnnsac.py index 78f188365461..e0fec4b5d7ee 100644 --- a/rllib/algorithms/sac/tests/test_rnnsac.py +++ b/rllib/algorithms/sac/tests/test_rnnsac.py @@ -23,7 +23,7 @@ def test_rnnsac_compilation(self): config = ( sac.RNNSACConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( # Wrap with an LSTM and use a very simple base-model. model={"max_seq_len": 20}, diff --git a/rllib/algorithms/sac/tests/test_sac.py b/rllib/algorithms/sac/tests/test_sac.py index dc3491a1e95f..1425fb6d93e7 100644 --- a/rllib/algorithms/sac/tests/test_sac.py +++ b/rllib/algorithms/sac/tests/test_sac.py @@ -82,7 +82,7 @@ def test_sac_compilation(self): store_buffer_in_checkpoints=True, train_batch_size=10, ) - .rollouts(num_rollout_workers=0, rollout_fragment_length=10) + .env_runners(num_env_runners=0, rollout_fragment_length=10) ) num_iterations = 1 @@ -202,8 +202,8 @@ def step(self, action): num_steps_sampled_before_learning_starts=0, train_batch_size=5, ) - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=5, ) .experimental(_disable_preprocessor_api=True) diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index 730f01dbb15a..6050bff3b9dc 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -57,7 +57,7 @@ def test_add_delete_policy(self): policy_map_capacity=2, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_config=ppo.PPOConfig.overrides(num_cpus_per_worker=0.1), ) ) @@ -343,7 +343,7 @@ def test_space_inference_from_remote_workers(self): config = ( ppo.PPOConfig() - .rollouts(num_rollout_workers=1, validate_workers_after_construction=False) + .env_runners(num_env_runners=1, validate_env_runners_after_construction=False) .environment(env="CartPole-v1") ) @@ -380,20 +380,20 @@ def test_space_inference_from_remote_workers(self): algo.stop() def test_worker_validation_time(self): - """Tests the time taken by `validate_workers_after_construction=True`.""" + """Tests the time taken by `validate_env_runners_after_construction=True`.""" config = ppo.PPOConfig().environment(env="CartPole-v1") - config.validate_workers_after_construction = True + config.validate_env_runners_after_construction = True # Test, whether validating one worker takes just as long as validating # >> 1 workers. 
- config.num_rollout_workers = 1 + config.num_env_runners = 1 t0 = time.time() algo = config.build() total_time_1 = time.time() - t0 print(f"Validating w/ 1 worker: {total_time_1}sec") algo.stop() - config.num_rollout_workers = 5 + config.num_env_runners = 5 t0 = time.time() algo = config.build() total_time_5 = time.time() - t0 @@ -418,7 +418,7 @@ def test_no_env_but_eval_workers_do_have_env(self): ) .evaluation( evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_config=BCConfig.overrides( env="CartPole-v1", input_="sampler", diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index fb9e415a5c04..9d50086c9398 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -96,9 +96,9 @@ def test_rollout_fragment_length(self): """Tests the proper auto-computation of the `rollout_fragment_length`.""" config = ( AlgorithmConfig() - .rollouts( - num_rollout_workers=4, - num_envs_per_worker=3, + .env_runners( + num_env_runners=4, + num_envs_per_env_runner=3, rollout_fragment_length="auto", ) .training(train_batch_size=2456) @@ -113,9 +113,9 @@ def test_rollout_fragment_length(self): config = ( AlgorithmConfig() - .rollouts( - num_rollout_workers=3, - num_envs_per_worker=2, + .env_runners( + num_env_runners=3, + num_envs_per_env_runner=2, rollout_fragment_length="auto", ) .training(train_batch_size=4000) @@ -129,8 +129,8 @@ def test_rollout_fragment_length(self): config = ( AlgorithmConfig() - .rollouts( - num_rollout_workers=12, + .env_runners( + num_env_runners=12, rollout_fragment_length="auto", ) .training(train_batch_size=1342) @@ -174,7 +174,7 @@ def test_rl_module_api(self): .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") .framework("torch") - .rollouts(enable_connectors=True) + .env_runners(enable_connectors=True) ) self.assertEqual(config.rl_module_spec.module_class, PPOTorchRLModule) @@ -233,7 +233,7 @@ def test_learner_api(self): PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts(enable_connectors=True) + .env_runners(enable_connectors=True) .framework("tf2") ) diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index 1f9851cc297b..f01b86151711 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -78,7 +78,7 @@ def test_episode_and_sample_callbacks(self): config = ( PPOConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) .callbacks(EpisodeAndSampleCallbacks) .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) ) @@ -99,7 +99,7 @@ def test_on_sub_environment_created(self): dqn.DQNConfig().environment("CartPole-v1") # Create 4 sub-environments per remote worker. # Create 2 remote workers. - .rollouts(num_envs_per_worker=4, num_rollout_workers=2) + .env_runners(num_envs_per_worker=4, num_rollout_workers=2) ) for callbacks in ( @@ -131,7 +131,7 @@ def test_on_sub_environment_created_with_remote_envs(self): config = ( dqn.DQNConfig() .environment("CartPole-v1") - .rollouts( + .env_runners( # Make each sub-environment a ray actor. remote_worker_envs=True, # Create 2 remote workers. 
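# Editor's note, not part of the patch: the callback tests above still pass the old
# kwarg names (`num_rollout_workers`, `num_envs_per_worker`) into `env_runners()`,
# which suggests those are kept as deprecated aliases. The fully renamed spelling of
# the same sub-environment setup would presumably look like this (DQN and the
# numbers mirror the test above; everything else is an assumption):
from ray.rllib.algorithms.dqn import DQNConfig

config = (
    DQNConfig()
    .environment("CartPole-v1")
    .env_runners(
        num_env_runners=2,           # was: num_rollout_workers
        num_envs_per_env_runner=4,   # was: num_envs_per_worker
        remote_worker_envs=True,     # each sub-environment becomes a Ray actor
    )
)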
@@ -179,13 +179,13 @@ def test_on_episode_created(self): "p_terminated": 0.0, }, ) - .rollouts(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_worker=2, num_rollout_workers=1) .callbacks(OnEpisodeCreatedCallback) ) # Test with and without Connectors. for connector in [True, False]: - config.rollouts(enable_connectors=connector) + config.env_runners(enable_connectors=connector) algo = config.build() algo.train() # Two sub-environments share 1000 steps in the first training iteration diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py index 5ef3a0b95a49..55d7fa31699b 100644 --- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py +++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py @@ -58,7 +58,7 @@ def test_on_workers_recreated_callback(self): APPOConfig() .environment("env") .callbacks(OnWorkersRecreatedCallbacks) - .rollouts(num_rollout_workers=3) + .env_runners(num_rollout_workers=3) .fault_tolerance( recreate_failed_workers=True, delay_between_worker_restarts_s=0, diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 2ac4e24ed85e..02b007eab2aa 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -74,7 +74,7 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( + .env_runners( num_rollout_workers=0, batch_mode="truncate_episodes", env_runner_cls=SingleAgentEnvRunner, @@ -117,7 +117,7 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( + .env_runners( batch_mode="complete_episodes", env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, @@ -159,7 +159,7 @@ def test_overriding_on_episode_created_throws_error_on_new_api_stack(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=SingleAgentEnvRunner) + .env_runners(env_runner_cls=SingleAgentEnvRunner) .callbacks(OnEpisodeCreatedCallback) ) self.assertRaises(ValueError, lambda: config.validate()) diff --git a/rllib/algorithms/tests/test_memory_leaks.py b/rllib/algorithms/tests/test_memory_leaks.py index 8e8669d15afd..38e002474495 100644 --- a/rllib/algorithms/tests/test_memory_leaks.py +++ b/rllib/algorithms/tests/test_memory_leaks.py @@ -30,7 +30,7 @@ def test_leaky_env(self): ) # Make sure we have an env to test on the local worker. # Otherwise, `check_memory_leaks` will complain. - .rollouts(create_env_on_local_worker=True) + .env_runners(create_env_on_local_worker=True) ) algo = config.build() results = check_memory_leaks(algo, to_check={"env"}, repeats=15) @@ -47,7 +47,7 @@ def test_leaky_policy(self): .environment("CartPole-v1") # Make sure we have an env to test on the local worker. # Otherwise, `check_memory_leaks` will complain. 
- .rollouts(create_env_on_local_worker=True) + .env_runners(create_env_on_local_worker=True) .multi_agent( policies={ "default_policy": PolicySpec( diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index dc0f46569ae5..6ccd1f6ef809 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -264,12 +264,12 @@ def tearDownClass(cls) -> None: def _do_test_failing_fatal(self, config, fail_eval=False): """Test raises real error when out of workers.""" - config.num_rollout_workers = 2 + config.num_env_runners = 2 config.env = "multi_agent_fault_env" if config.is_multi_agent() else "fault_env" # Make both worker idx=1 and 2 fail. config.env_config = {"bad_indices": [1, 2]} if fail_eval: - config.evaluation_num_workers = 2 + config.evaluation_num_env_runners = 2 config.evaluation_interval = 1 config.evaluation_config = { # Make eval worker (index 1) fail. @@ -285,7 +285,7 @@ def _do_test_failing_fatal(self, config, fail_eval=False): def _do_test_failing_ignore(self, config: AlgorithmConfig, fail_eval: bool = False): # Test fault handling - config.num_rollout_workers = 2 + config.num_env_runners = 2 config.ignore_worker_failures = True config.recreate_failed_workers = False config.env = "fault_env" @@ -296,7 +296,7 @@ def _do_test_failing_ignore(self, config: AlgorithmConfig, fail_eval: bool = Fal } ) if fail_eval: - config.evaluation_num_workers = 2 + config.evaluation_num_env_runners = 2 config.evaluation_interval = 1 config.evaluation_config = { "ignore_worker_failures": True, @@ -327,8 +327,8 @@ def _do_test_failing_recover(self, config, multi_agent=False): counter = Counter.options(name=COUNTER_NAME).remote() # Test raises real error when out of workers. 
- config.num_rollout_workers = 1 - config.evaluation_num_workers = 1 + config.num_env_runners = 1 + config.evaluation_num_env_runners = 1 config.evaluation_interval = 1 config.env = "fault_env" if not multi_agent else "multi_agent_fault_env" config.evaluation_config = AlgorithmConfig.overrides( @@ -390,7 +390,7 @@ def test_fatal_single_agent(self): self._do_test_failing_fatal( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=lambda env: FlattenObservations(), ) @@ -401,7 +401,7 @@ def test_fatal_multi_agent(self): self._do_test_failing_fatal( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=MultiAgentEnvRunner) + .env_runners(env_runner_cls=MultiAgentEnvRunner) .multi_agent(policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0"), ) @@ -410,7 +410,7 @@ def test_fatal_multi_agent(self): # self._do_test_fault_ignore( # ImpalaConfig() # .experimental(_enable_new_api_stack=True) - # .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorker) + # .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) # .resources(num_gpus=0) # ) @@ -421,7 +421,7 @@ def test_sync_replay(self): .environment( env_config={"action_space": gym.spaces.Box(0, 1, (2,), np.float32)} ) - .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorker) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .reporting(min_sample_timesteps_per_iteration=1) .training(replay_buffer_config={"type": "EpisodeReplayBuffer"}) ) @@ -430,7 +430,7 @@ def test_multi_gpu(self): self._do_test_failing_ignore( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, ) .training( @@ -444,7 +444,7 @@ def test_sync_samples(self): self._do_test_failing_ignore( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorker) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(optimizer={}) ) @@ -453,7 +453,7 @@ def test_eval_workers_failing_ignore(self): self._do_test_failing_ignore( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorker) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(model={"fcnet_hiddens": [4]}), fail_eval=True, ) @@ -463,9 +463,9 @@ def test_eval_workers_parallel_to_training_failing_recover(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorker) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_duration="auto", ) @@ -483,7 +483,7 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) .multi_agent( policies={"main", "p0", "p1"}, policy_mapping_fn=( @@ -495,7 +495,7 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( ), ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_duration="auto", ) @@ -519,9 +519,9 @@ def test_workers_failing_recover(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( 
env_runner_cls=ForwardHealthCheckToEnvWorker, - num_rollout_workers=2, + num_env_runners=2, rollout_fragment_length=16, ) .training( @@ -573,9 +573,9 @@ def test_modules_are_restored_on_recovered_worker(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent, - num_rollout_workers=2, + num_env_runners=2, rollout_fragment_length=16, ) .training( @@ -594,7 +594,7 @@ def test_modules_are_restored_on_recovered_worker(self): }, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_config=PPOConfig.overrides( recreate_failed_workers=True, @@ -675,9 +675,9 @@ def test_eval_workers_failing_recover(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, - num_rollout_workers=2, + num_env_runners=2, rollout_fragment_length=16, ) .training( @@ -687,7 +687,7 @@ def test_eval_workers_failing_recover(self): ) .environment(env="fault_env") .evaluation( - evaluation_num_workers=2, + evaluation_num_env_runners=2, evaluation_interval=1, evaluation_config=PPOConfig.overrides( env_config={ @@ -744,9 +744,9 @@ def test_worker_failing_recover_with_hanging_workers(self): .training( replay_buffer_config={"type": "EpisodeReplayBuffer"}, ) - .rollouts( + .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, - num_rollout_workers=3, + num_env_runners=3, rollout_fragment_length=16, sample_timeout_s=5.0, ) @@ -817,7 +817,7 @@ def test_eval_workers_on_infinite_episodes(self): .environment(env=RandomEnv, env_config={"p_terminated": 0.0}) .training(train_batch_size_per_learner=200) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_sample_timeout_s=2.0, ) diff --git a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py index 021b56d74ccd..e1c4ea184228 100644 --- a/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py +++ b/rllib/benchmarks/torch_compile/run_ppo_with_inference_bm.py @@ -50,9 +50,9 @@ def main(pargs): grad_clip=100, grad_clip_by="global_norm", ) - .rollouts( - num_rollout_workers=1 if pargs.smoke_test else 64, - num_envs_per_worker=1, + .env_runners( + num_env_runners=1 if pargs.smoke_test else 64, + num_envs_per_env_runner=1, batch_mode="truncate_episodes", rollout_fragment_length="auto", create_env_on_local_worker=True, diff --git a/rllib/connectors/tests/test_agent.py b/rllib/connectors/tests/test_agent.py index e34a1be3cfe7..6deb2dc29077 100644 --- a/rllib/connectors/tests/test_agent.py +++ b/rllib/connectors/tests/test_agent.py @@ -465,7 +465,7 @@ def test_connector_pipline_with_view_requirement(self): PPOConfig() .framework("torch") .environment(env="CartPole-v1") - .rollouts(create_env_on_local_worker=True) + .env_runners(create_env_on_local_worker=True) ) env = gym.make("CartPole-v1") diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 120ef1dc6780..976a0d901b3e 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -113,7 +113,7 @@ def sample( `forward_inference()` method. If None (default), will use the `explore` boolean setting from `self.config` passed into this EnvRunner's constructor. You can change this setting in your config via - `config.exploration(explore=True|False)`. + `config.env_runners(explore=True|False)`. 
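# Editor's note, not part of the patch: a small sketch of the `explore` handling
# described in the docstring above; the algorithm and env are assumptions.
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    # Deterministic (greedy) action selection by default for all EnvRunners ...
    .env_runners(explore=False)
)
# ... which an individual call can still override, e.g.
# `env_runner.sample(explore=True)` for stochastic actions in just that call.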
random_actions: If True, actions will be sampled randomly (from the action space of the environment). If False (default), actions or action distribution parameters are computed by the RLModule. diff --git a/rllib/env/policy_server_input.py b/rllib/env/policy_server_input.py index 5379543e8069..ba339b7d77a3 100644 --- a/rllib/env/policy_server_input.py +++ b/rllib/env/policy_server_input.py @@ -54,7 +54,7 @@ class PolicyServerInput(ThreadingMixIn, HTTPServer, InputReader): input_=lambda ioctx: PolicyServerInput(ioctx, addr, port) ) # Run just 1 server (in the Algorithm's WorkerSet). - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) ) algo = config.build() while True: @@ -86,13 +86,13 @@ def __init__( any Algorithm by configuring [AlgorithmConfig object] - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .offline_data(input_=lambda ioctx: PolicyServerInput(ioctx, addr, port)) - Note that by setting num_rollout_workers: 0, the algorithm will only create one + Note that by setting num_env_runners: 0, the algorithm will only create one rollout worker / PolicyServerInput. Clients can connect to the launched server using rllib.env.PolicyClient. You can increase the number of available - connections (ports) by setting num_rollout_workers to a larger number. The ports + connections (ports) by setting num_env_runners to a larger number. The ports used will then be `port` + the worker's index. Args: diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 5e8c486b47c6..9b198cd151b9 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -128,7 +128,7 @@ def sample( `forward_inference()` method. If None (default), will use the `explore` boolean setting from `self.config` passed into this EnvRunner's constructor. You can change this setting in your config via - `config.exploration(explore=True|False)`. + `config.env_runners(explore=True|False)`. random_actions: If True, actions will be sampled randomly (from the action space of the environment). If False (default), actions or action distribution parameters are computed by the RLModule. diff --git a/rllib/env/tests/test_multi_agent_env.py b/rllib/env/tests/test_multi_agent_env.py index e1f370e77bfa..aeb1a27f6d8b 100644 --- a/rllib/env/tests/test_multi_agent_env.py +++ b/rllib/env/tests/test_multi_agent_env.py @@ -496,7 +496,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): env_creator=lambda _: BasicMultiAgent(5), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts(rollout_fragment_length=50, num_rollout_workers=0) + .env_runners(rollout_fragment_length=50, num_env_runners=0) .multi_agent( policies={"p0", "p1"}, policy_mapping_fn=policy_mapping_fn, @@ -516,10 +516,10 @@ def test_multi_agent_sample_sync_remote(self): # to the new signature we are using (agent_id, episode, **kwargs), # but should not break this test. 
config=AlgorithmConfig() - .rollouts( + .env_runners( rollout_fragment_length=50, - num_rollout_workers=0, - num_envs_per_worker=4, + num_env_runners=0, + num_envs_per_env_runner=4, remote_worker_envs=True, remote_env_batch_wait_ms=99999999, ) @@ -538,10 +538,10 @@ def test_multi_agent_sample_async_remote(self): env_creator=lambda _: BasicMultiAgent(5), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( + .env_runners( rollout_fragment_length=50, - num_rollout_workers=0, - num_envs_per_worker=4, + num_env_runners=0, + num_envs_per_env_runner=4, remote_worker_envs=True, ) .multi_agent( @@ -559,9 +559,9 @@ def test_sample_from_early_done_env(self): env_creator=lambda _: EarlyDoneMultiAgent(), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( + .env_runners( rollout_fragment_length=1, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ) .multi_agent( @@ -591,7 +591,7 @@ def test_multi_agent_with_flex_agents(self): config = ( PPOConfig() .environment("flex_agents_multi_agent") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .framework("tf") .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) ) @@ -612,7 +612,7 @@ def test_multi_agent_with_sometimes_zero_agents_observing(self): config = ( PPOConfig() .environment("sometimes_zero_agents") - .rollouts(num_rollout_workers=0, enable_connectors=True) + .env_runners(num_env_runners=0, enable_connectors=True) .framework("tf") ) algo = config.build() @@ -630,9 +630,9 @@ def test_multi_agent_sample_round_robin(self): env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( + .env_runners( rollout_fragment_length=50, - num_rollout_workers=0, + num_env_runners=0, ) .multi_agent( policies={"p0"}, @@ -703,9 +703,9 @@ def is_recurrent(self): env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=StatefulPolicy, config=( - AlgorithmConfig().rollouts( + AlgorithmConfig().env_runners( rollout_fragment_length=5, - num_rollout_workers=0, + num_env_runners=0, ) # Force `state_in_0` to be repeated every ts in the collected batch # (even though we don't even have a model that would care about this). 
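The hunks above swap `AlgorithmConfig.rollouts()` for `AlgorithmConfig.env_runners()` and rename its worker/env counts. For reference, a minimal sketch of the new spelling next to the old one (the env name and the values here are placeholders, not taken from the tests above):

```python
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig

# Old spelling (pre-rename):
#   config.rollouts(num_rollout_workers=0, num_envs_per_worker=4, rollout_fragment_length=50)
# New spelling used throughout this patch:
config = (
    AlgorithmConfig()
    .environment("CartPole-v1")
    .env_runners(
        num_env_runners=0,           # was: num_rollout_workers
        num_envs_per_env_runner=4,   # was: num_envs_per_worker
        rollout_fragment_length=50,
    )
)
```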
@@ -781,9 +781,9 @@ def compute_actions_from_input_dict( default_policy_class=ModelBasedPolicy, config=DQNConfig() .framework("tf") - .rollouts( + .env_runners( rollout_fragment_length=5, - num_rollout_workers=0, + num_env_runners=0, enable_connectors=False, # only works with old episode API ) .multi_agent( @@ -808,7 +808,7 @@ def test_train_multi_agent_cartpole_single_policy(self): config = ( PPOConfig() .environment("multi_agent_cartpole") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .framework("tf") ) @@ -841,7 +841,7 @@ def gen_policy(): config = ( PPOConfig() .environment("multi_agent_cartpole") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .multi_agent( policies={ "policy_1": gen_policy(), diff --git a/rllib/env/tests/test_multi_agent_env_runner.py b/rllib/env/tests/test_multi_agent_env_runner.py index da65e5558c2e..d27f4779bd67 100644 --- a/rllib/env/tests/test_multi_agent_env_runner.py +++ b/rllib/env/tests/test_multi_agent_env_runner.py @@ -100,7 +100,7 @@ def _build_config(self): MultiAgentCartPole, env_config={"num_agents": 2}, ) - .rollouts(env_runner_cls=MultiAgentEnvRunner) + .env_runners(env_runner_cls=MultiAgentEnvRunner) # TODO (sven, simon): Setup is still for `Policy`, change as soon # as we have switched fully to the new stack. .multi_agent( diff --git a/rllib/env/tests/test_single_agent_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py index 743e885291e3..2045963e899c 100644 --- a/rllib/env/tests/test_single_agent_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -18,7 +18,7 @@ def test_sample(self): config = ( AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. - .rollouts(num_envs_per_worker=2, rollout_fragment_length=64) + .env_runners(num_envs_per_worker=2, rollout_fragment_length=64) ) env_runner = SingleAgentEnvRunner(config=config) @@ -63,7 +63,7 @@ def test_distributed_env_runner(self): config = ( AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. - .rollouts( + .env_runners( num_rollout_workers=5, num_envs_per_worker=5, rollout_fragment_length=10, diff --git a/rllib/env/wrappers/model_vector_env.py b/rllib/env/wrappers/model_vector_env.py index 7456195b0d87..8facedab25e8 100644 --- a/rllib/env/wrappers/model_vector_env.py +++ b/rllib/env/wrappers/model_vector_env.py @@ -30,14 +30,14 @@ def model_vector_env(env: EnvType) -> BaseEnv: env = _VectorizedModelGymEnv( make_env=worker.make_sub_env_fn, existing_envs=[env], - num_envs=worker.config.num_envs_per_worker, + num_envs=worker.config.num_envs_per_env_runner, observation_space=env.observation_space, action_space=env.action_space, ) return convert_to_base_env( env, make_env=worker.make_sub_env_fn, - num_envs=worker.config.num_envs_per_worker, + num_envs=worker.config.num_envs_per_env_runner, remote_envs=False, remote_env_batch_wait_ms=0, ) diff --git a/rllib/evaluate.py b/rllib/evaluate.py index ad2a4b8abb28..dbfc483d793b 100755 --- a/rllib/evaluate.py +++ b/rllib/evaluate.py @@ -234,8 +234,8 @@ def run( env = config.get("env") # Make sure we have evaluation workers. 
- if not config.get("evaluation_num_workers"): - config["evaluation_num_workers"] = config.get("num_workers", 0) + if not config.get("evaluation_num_workers", config.get("evaluation_num_env_runners")): + config["evaluation_num_env_runners"] = config.get("num_workers", 0) if not config.get("evaluation_duration"): config["evaluation_duration"] = 1 diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index b234e0653906..f0623b3cbb00 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -134,9 +134,9 @@ def _update_env_seed_if_necessary( # rollout workers. max_num_envs_per_workers: int = 1000 assert ( - worker_idx < max_num_envs_per_workers + worker_idx < max_num_envs_per_env_runners ), "Too many envs per worker. Random seeds may collide." - computed_seed: int = worker_idx * max_num_envs_per_workers + vector_idx + seed + computed_seed: int = worker_idx * max_num_envs_per_env_runners + vector_idx + seed # Gymnasium.env. # This will silently fail for most Farama-foundation gymnasium environments. @@ -295,7 +295,7 @@ def gen_rollouts(): EnvRunner.__init__(self, config=config) self.num_workers = ( - num_workers if num_workers is not None else self.config.num_rollout_workers + num_workers if num_workers is not None else self.config.num_env_runners ) # In case we are reading from distributed datasets, store the shards here # and pick our shard by our worker-index. @@ -356,7 +356,7 @@ def gen_rollouts(): worker_index=self.worker_index ) self.total_rollout_fragment_length: int = ( - configured_rollout_fragment_length * self.config.num_envs_per_worker + configured_rollout_fragment_length * self.config.num_envs_per_env_runner ) self.preprocessing_enabled: bool = not config._disable_preprocessor_api self.last_batch: Optional[SampleBatchType] = None @@ -566,11 +566,11 @@ def wrap(env): # further clones of self.env and creates a RLlib BaseEnv (which is # vectorized under the hood). else: - # Always use vector env for consistency even if num_envs_per_worker=1. + # Always use vector env for consistency even if num_envs_per_env_runner=1. self.async_env: BaseEnv = convert_to_base_env( self.env, make_env=self.make_sub_env_fn, - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, remote_envs=self.config.remote_worker_envs, remote_env_batch_wait_ms=self.config.remote_env_batch_wait_ms, worker=self, diff --git a/rllib/evaluation/tests/test_env_runner_v2.py b/rllib/evaluation/tests/test_env_runner_v2.py index 473bee13ba3d..3d57be1e5939 100644 --- a/rllib/evaluation/tests/test_env_runner_v2.py +++ b/rllib/evaluation/tests/test_env_runner_v2.py @@ -58,9 +58,9 @@ def test_sample_batch_rollout_single_agent_env(self): # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) @@ -85,9 +85,9 @@ def test_sample_batch_rollout_multi_agent_env(self): # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. 
enable_connectors=True, ) @@ -150,9 +150,9 @@ def compute_actions( PPOConfig() .framework("torch") .environment("env_under_test") - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, rollout_fragment_length=100, @@ -217,9 +217,9 @@ def __init__(self, *args, **kwargs): # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) @@ -291,9 +291,9 @@ def on_create_policy(self, *, policy_id, policy) -> None: .callbacks( callbacks_class=AddActionConnectorCallbacks, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) @@ -314,9 +314,9 @@ def test_start_episode(self): # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) @@ -370,9 +370,9 @@ def test_env_runner_output(self): # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) @@ -429,9 +429,9 @@ def on_episode_end( # Specifically ask for a batch of 200 samples. train_batch_size=200, ) - .rollouts( - num_envs_per_worker=1, - num_rollout_workers=0, + .env_runners( + num_envs_per_env_runner=1, + num_env_runners=0, # Enable EnvRunnerV2. enable_connectors=True, ) diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index d8f7d4c2d195..03f1b51f2130 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -29,7 +29,7 @@ def test_env_crash_during_pre_checking(self): """Expect the env pre-checking to fail on each worker.""" config = ( PPOConfig() - .rollouts(num_rollout_workers=2, num_envs_per_worker=4) + .env_runners(num_env_runners=2, num_envs_per_env_runner=4) .environment( env=CartPoleCrashing, env_config={ @@ -52,7 +52,7 @@ def test_env_crash_during_sampling(self): """Expect some sub-envs to fail (and not recover).""" config = ( PPOConfig() - .rollouts(num_rollout_workers=2, num_envs_per_worker=3) + .env_runners(num_env_runners=2, num_envs_per_env_runner=3) .environment( env=CartPoleCrashing, env_config={ @@ -84,9 +84,9 @@ def test_env_crash_on_one_worker_during_sampling_but_ignore(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( - num_rollout_workers=2, - num_envs_per_worker=3, + .env_runners( + num_env_runners=2, + num_envs_per_env_runner=3, # Ignore worker failures (continue with worker #2). ignore_worker_failures=True, ) @@ -120,11 +120,11 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( # env_runner_cls=ForwardHealthCheckToEnvWorker, - num_rollout_workers=2, + num_env_runners=2, rollout_fragment_length=10, - num_envs_per_worker=3, + num_envs_per_env_runner=3, # Re-create failed workers (then continue). 
recreate_failed_workers=True, ) @@ -164,9 +164,9 @@ def test_env_crash_during_sampling_but_restart_only_crashed_sub_env(self): """Expect sub-envs to fail (and not recover), but re-start them individually.""" config = ( PPOConfig() - .rollouts( - num_rollout_workers=2, - num_envs_per_worker=3, + .env_runners( + num_env_runners=2, + num_envs_per_env_runner=3, # Re-start failed individual sub-envs (then continue). # This means no workers will ever fail due to individual env errors # (only maybe for reasons other than the env). diff --git a/rllib/evaluation/tests/test_episode.py b/rllib/evaluation/tests/test_episode.py index 814e3598b9ae..d61e94cf3302 100644 --- a/rllib/evaluation/tests/test_episode.py +++ b/rllib/evaluation/tests/test_episode.py @@ -142,8 +142,8 @@ def test_single_agent_env(self): default_policy_class=EchoPolicy, # Episode only works with env runner v1. config=AlgorithmConfig() - .rollouts(enable_connectors=False) - .rollouts(num_rollout_workers=0) + .env_runners(enable_connectors=False) + .env_runners(num_env_runners=0) .callbacks(LastInfoCallback), ) ev.sample() @@ -154,8 +154,8 @@ def test_multi_agent_env(self): default_policy_class=EchoPolicy, # Episode only works with env runner v1. config=AlgorithmConfig() - .rollouts(enable_connectors=False) - .rollouts(num_rollout_workers=0) + .env_runners(enable_connectors=False) + .env_runners(num_env_runners=0) .callbacks(LastInfoCallback) .multi_agent( policies={str(agent_id) for agent_id in range(NUM_AGENTS)}, diff --git a/rllib/evaluation/tests/test_episode_v2.py b/rllib/evaluation/tests/test_episode_v2.py index c4d02adfa9cc..ee493b6c655f 100644 --- a/rllib/evaluation/tests/test_episode_v2.py +++ b/rllib/evaluation/tests/test_episode_v2.py @@ -77,9 +77,9 @@ def test_single_agent_env(self): ev = RolloutWorker( env_creator=lambda _: MockEnv3(NUM_STEPS), default_policy_class=EchoPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( enable_connectors=True, - num_rollout_workers=0, + num_env_runners=0, ), ) ma_batch = ev.sample() @@ -101,7 +101,7 @@ def test_multi_agent_env(self): str(agent_id) ), ) - .rollouts(enable_connectors=True, num_rollout_workers=0), + .env_runners(enable_connectors=True, num_env_runners=0), ) sample_batches = ev.sample() self.assertEqual(len(sample_batches.policy_batches), 4) diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 02a4b7b167ce..a9157bf4184c 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -103,7 +103,7 @@ def test_basic(self): ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts(num_rollout_workers=0), + config=AlgorithmConfig().env_runners(num_env_runners=0), ) batch = convert_ma_batch_to_sample_batch(ev.sample()) for key in [ @@ -140,8 +140,8 @@ def test_batch_ids(self): ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( - rollout_fragment_length=fragment_len, num_rollout_workers=0 + config=AlgorithmConfig().env_runners( + rollout_fragment_length=fragment_len, num_env_runners=0 ), ) batch1 = convert_ma_batch_to_sample_batch(ev.sample()) @@ -160,7 +160,7 @@ def test_global_vars_update(self): config = ( PPOConfig() .environment("CartPole-v1") - .rollouts(num_envs_per_worker=1) + .env_runners(num_envs_per_env_runner=1) # lr = 0.1 - [(0.1 - 0.000001) / 100000] * ts 
.training(lr_schedule=[[0, 0.1], [100000, 0.000001]]) ) @@ -194,7 +194,7 @@ def test_global_vars_update(self): def test_no_step_on_init(self): register_env("fail", lambda _: FailOnStepEnv()) - config = PPOConfig().environment("fail").rollouts(num_rollout_workers=2) + config = PPOConfig().environment("fail").env_runners(num_env_runners=2) for _ in framework_iterator(config): # We expect this to fail already on Algorithm init due # to the env sanity check right after env creation (inside @@ -209,9 +209,9 @@ def test_query_evaluators(self): config = ( PPOConfig() .environment("test") - .rollouts( - num_rollout_workers=2, - num_envs_per_worker=2, + .env_runners( + num_env_runners=2, + num_envs_per_env_runner=2, create_env_on_local_worker=True, ) .training(train_batch_size=20, sgd_minibatch_size=5, num_sgd_iter=1) @@ -254,7 +254,7 @@ def test_action_clipping(self): ) } ) - .rollouts(num_rollout_workers=0, batch_mode="complete_episodes") + .env_runners(num_env_runners=0, batch_mode="complete_episodes") .environment( action_space=action_space, normalize_actions=False, clip_actions=True ), @@ -286,7 +286,7 @@ def test_action_clipping(self): clip_actions=False, action_space=action_space, ) - .rollouts(batch_mode="complete_episodes", num_rollout_workers=0) + .env_runners(batch_mode="complete_episodes", num_env_runners=0) .multi_agent( policies={ "default_policy": PolicySpec( @@ -311,8 +311,8 @@ def test_action_clipping(self): ) ), default_policy_class=RandomPolicy, - config=AlgorithmConfig().rollouts( - num_rollout_workers=0, batch_mode="complete_episodes" + config=AlgorithmConfig().env_runners( + num_env_runners=0, batch_mode="complete_episodes" ) # Should not be a problem as RandomPolicy abides to bounds. .environment( @@ -346,7 +346,7 @@ def test_action_normalization(self): ) } ) - .rollouts(num_rollout_workers=0, batch_mode="complete_episodes") + .env_runners(num_env_runners=0, batch_mode="complete_episodes") .environment( action_space=action_space, normalize_actions=True, clip_actions=False ), @@ -411,8 +411,8 @@ def json_reader_creator(ioctx): env_creator=lambda _: env, default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=1, ) .environment( @@ -476,7 +476,7 @@ def step(self, action): } ) .environment(action_space=action_space, clip_actions=False) - .rollouts(batch_mode="complete_episodes", num_rollout_workers=0), + .env_runners(batch_mode="complete_episodes", num_env_runners=0), ) ev.sample() ev.stop() @@ -485,7 +485,7 @@ def test_reward_clipping(self): # Clipping: True (clip between -1.0 and 1.0). 
config = ( AlgorithmConfig() - .rollouts(num_rollout_workers=0, batch_mode="complete_episodes") + .env_runners(num_env_runners=0, batch_mode="complete_episodes") .environment(clip_rewards=True) ) ev = RolloutWorker( @@ -521,7 +521,7 @@ def test_reward_clipping(self): ), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts(num_rollout_workers=0, batch_mode="complete_episodes") + .env_runners(num_env_runners=0, batch_mode="complete_episodes") .environment(clip_rewards=2.0), ) sample = convert_ma_batch_to_sample_batch(ev2.sample()) @@ -536,7 +536,7 @@ def test_reward_clipping(self): env_creator=lambda _: MockEnv2(episode_length=10), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts(num_rollout_workers=0, batch_mode="complete_episodes") + .env_runners(num_env_runners=0, batch_mode="complete_episodes") .environment(clip_rewards=False), ) sample = convert_ma_batch_to_sample_batch(ev2.sample()) @@ -553,18 +553,18 @@ def test_metrics(self): ev = RolloutWorker( env_creator=lambda _: MockEnv(episode_length=10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=100, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) remote_ev = ray.remote(RolloutWorker).remote( env_creator=lambda _: MockEnv(episode_length=10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=100, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) @@ -583,10 +583,10 @@ def test_auto_vectorization(self): ev = RolloutWorker( env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=2, - num_envs_per_worker=8, - num_rollout_workers=0, + num_envs_per_env_runner=8, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -615,10 +615,10 @@ def test_batches_larger_when_vectorized(self): ev = RolloutWorker( env_creator=lambda _: MockEnv(episode_length=8), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=4, - num_envs_per_worker=4, - num_rollout_workers=0, + num_envs_per_env_runner=4, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -641,9 +641,9 @@ def test_vector_env_support(self): ev = RolloutWorker( env_creator=(lambda _: VectorizedMockEnv(episode_length=20, num_envs=8)), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=10, - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -668,9 +668,9 @@ def test_vector_env_support(self): ev = RolloutWorker( env_creator=(lambda _: MockVectorEnv(20, mocked_num_envs=4)), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=10, - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -694,7 +694,7 @@ def test_truncate_episodes(self): ev_env_steps = RolloutWorker( env_creator=lambda _: MockEnv(10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=15, num_rollout_workers=0, batch_mode="truncate_episodes", @@ -711,7 +711,7 @@ def test_truncate_episodes(self): env_creator=lambda _: 
MultiAgentCartPole({"num_agents": 4}), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( + .env_runners( num_rollout_workers=0, batch_mode="truncate_episodes", rollout_fragment_length=301, @@ -736,7 +736,7 @@ def test_truncate_episodes(self): env_creator=lambda _: MultiAgentCartPole({"num_agents": 4}), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts( + .env_runners( num_rollout_workers=0, rollout_fragment_length=301, ) @@ -764,7 +764,7 @@ def test_complete_episodes(self): ev = RolloutWorker( env_creator=lambda _: MockEnv(10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=5, num_rollout_workers=0, batch_mode="complete_episodes", @@ -778,7 +778,7 @@ def test_complete_episodes_packing(self): ev = RolloutWorker( env_creator=lambda _: MockEnv(10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=15, num_rollout_workers=0, batch_mode="complete_episodes", @@ -797,7 +797,7 @@ def test_filter_sync(self): ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( num_rollout_workers=0, observation_filter="ConcurrentMeanStdFilter", ), @@ -814,7 +814,7 @@ def test_get_filters(self): ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", num_rollout_workers=0, ), @@ -833,7 +833,7 @@ def test_sync_filter(self): ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", num_rollout_workers=0, ), @@ -864,7 +864,7 @@ def test_extra_python_envs(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .python_environment(extra_python_environs_for_driver=extra_envs) - .rollouts(num_rollout_workers=0), + .env_runners(num_rollout_workers=0), ) self.assertTrue("env_key_1" in os.environ) self.assertTrue("env_key_2" in os.environ) @@ -878,7 +878,7 @@ def test_no_env_seed(self): ev = RolloutWorker( env_creator=lambda _: MockVectorEnv(20, mocked_num_envs=8), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts(num_rollout_workers=0).debugging(seed=1), + config=AlgorithmConfig().env_runners(num_rollout_workers=0).debugging(seed=1), ) assert not hasattr(ev.env, "seed") ev.stop() @@ -888,7 +888,7 @@ def test_multi_env_seed(self): env_creator=lambda _: MockEnv2(100), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts(num_envs_per_worker=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) .debugging(seed=1), ) # Make sure we can properly sample from the wrapped env. 
@@ -923,7 +923,7 @@ def step(self, action_dict): env_creator=lambda _: MockMultiAgentEnv(), default_policy_class=MockPolicy, config=AlgorithmConfig() - .rollouts(num_envs_per_worker=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) .multi_agent(policies={"policy_1", "policy_2"}) .debugging(seed=1), ) @@ -938,7 +938,7 @@ def test_wrap_multi_agent_env(self): ev = RolloutWorker( env_creator=lambda _: BasicMultiAgent(10), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", num_rollout_workers=0, @@ -970,7 +970,7 @@ def step(self, action): ev = RolloutWorker( env_creator=lambda _: NoTrainingEnv(10, True), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", num_rollout_workers=0, @@ -985,7 +985,7 @@ def step(self, action): ev = RolloutWorker( env_creator=lambda _: NoTrainingEnv(10, False), default_policy_class=MockPolicy, - config=AlgorithmConfig().rollouts( + config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", num_rollout_workers=0, diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 16acebc3a2d8..3cd81eb6dce3 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -57,7 +57,7 @@ def test_traj_view_normal_case(self): """Tests, whether Model and Policy return the correct ViewRequirements.""" config = ( dqn.DQNConfig() - .rollouts(num_envs_per_worker=10, rollout_fragment_length=4) + .env_runners(num_envs_per_env_runner=10, rollout_fragment_length=4) .environment( "ray.rllib.examples.envs.classes.debug_counter_env.DebugCounterEnv" ) @@ -92,7 +92,7 @@ def test_traj_view_normal_case(self): rollout_worker = algo.workers.local_worker() sample_batch = rollout_worker.sample() sample_batch = convert_ma_batch_to_sample_batch(sample_batch) - expected_count = config.num_envs_per_worker * config.rollout_fragment_length + expected_count = config.num_envs_per_env_runner * config.rollout_fragment_length assert sample_batch.count == expected_count for v in sample_batch.values(): assert len(v) == expected_count @@ -115,7 +115,7 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): "lstm_use_prev_reward": True, }, ) - .rollouts(create_env_on_local_worker=True) + .env_runners(create_env_on_local_worker=True) ) for _ in framework_iterator(config): @@ -190,7 +190,7 @@ def test_traj_view_attention_net(self): "ray.rllib.examples.envs.classes.debug_counter_env.DebugCounterEnv", env_config={"config": {"start_at_t": 1}}, # first obs is [1.0] ) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .callbacks(MyCallbacks) # Setup attention net. 
.training( @@ -229,7 +229,7 @@ def test_traj_view_next_action(self): ppo.PPOConfig() .experimental(_enable_new_api_stack=True) .framework("torch") - .rollouts(rollout_fragment_length=200, num_rollout_workers=0) + .env_runners(rollout_fragment_length=200, num_env_runners=0) ) config.validate() rollout_worker_w_api = RolloutWorker( @@ -312,8 +312,8 @@ def policy_fn(agent_id, episode, worker, **kwargs): model={"max_seq_len": max_seq_len}, train_batch_size=2010, ) - .rollouts( - num_rollout_workers=0, + .env_runners( + num_env_runners=0, rollout_fragment_length=rollout_fragment_length, ) .environment(normalize_actions=False) @@ -333,7 +333,7 @@ def test_counting_by_agent_steps(self): .experimental(_enable_new_api_stack=True) # Env setup. .environment(MultiAgentPendulum, env_config={"num_agents": num_agents}) - .rollouts(num_rollout_workers=2, rollout_fragment_length=21) + .env_runners(num_env_runners=2, rollout_fragment_length=21) .training(num_sgd_iter=2, train_batch_size=168) .framework("torch") .multi_agent( diff --git a/rllib/evaluation/tests/test_worker_set.py b/rllib/evaluation/tests/test_worker_set.py index c67d93a65a06..2d4b0ca71cd6 100644 --- a/rllib/evaluation/tests/test_worker_set.py +++ b/rllib/evaluation/tests/test_worker_set.py @@ -22,7 +22,7 @@ def test_foreach_worker(self): ws = WorkerSet( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=RandomPolicy, - config=AlgorithmConfig().rollouts(num_rollout_workers=2), + config=AlgorithmConfig().env_runners(num_env_runners=2), num_workers=2, ) @@ -51,7 +51,7 @@ def test_foreach_worker_return_obj_refss(self): ws = WorkerSet( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=RandomPolicy, - config=AlgorithmConfig().rollouts(num_rollout_workers=2), + config=AlgorithmConfig().env_runners(num_env_runners=2), num_workers=2, ) @@ -73,7 +73,7 @@ def test_foreach_worker_async(self): ws = WorkerSet( env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=RandomPolicy, - config=AlgorithmConfig().rollouts(num_rollout_workers=2), + config=AlgorithmConfig().env_runners(num_env_runners=2), num_workers=2, ) diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 912e77056d98..fd8f18b0932f 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -229,7 +229,7 @@ def _setup( # Create a number of @ray.remote workers. self.add_workers( num_workers, - validate=config.validate_workers_after_construction, + validate=config.validate_env_runners_after_construction, ) # If num_workers > 0 and we don't have an env on the local worker, diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index b1a632106944..1c3c2d330b4c 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -59,7 +59,7 @@ def step(self, action): env_config={"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))}, ) # Parallelize environment rollouts. 
- .rollouts(num_rollout_workers=3) + .env_runners(num_rollout_workers=3) ) algo = config.build() diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py index eeba2ed026b5..6c408ba52d54 100644 --- a/rllib/examples/_old_api_stack/complex_struct_space.py +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -40,7 +40,7 @@ PPOConfig() .environment(SimpleRPG) .framework(args.framework) - .rollouts(rollout_fragment_length=1, num_rollout_workers=0) + .env_runners(rollout_fragment_length=1, num_rollout_workers=0) .training(train_batch_size=2, model={"custom_model": "my_model"}) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py index 3fff94e3839a..ae191e78513a 100644 --- a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py @@ -6,7 +6,7 @@ def create_appo_cartpole_checkpoint(output_dir): # enable_connectors defaults to True. Just trying to be explicit here. - config = APPOConfig().environment("CartPole-v1").rollouts(enable_connectors=True) + config = APPOConfig().environment("CartPole-v1").env_runners(enable_connectors=True) # Build algorithm object. algo = config.build() algo.save(checkpoint_dir=output_dir) @@ -22,7 +22,7 @@ def _policy_mapping_fn(*args, **kwargs): # Intentionally create a TF2 policy to demonstrate that we can restore # and use a TF policy in a Torch training stack. .framework("tf2") - .rollouts( + .env_runners( num_rollout_workers=1, num_envs_per_worker=5, # We will be restoring a TF2 policy. diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index 0c85f0fbcc78..275b928eb56a 100644 --- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -72,7 +72,7 @@ def main(checkpoint_dir): .environment("open_spiel_env") .framework("torch") .callbacks(partial(AddPolicyCallback, checkpoint_dir)) - .rollouts( + .env_runners( num_rollout_workers=1, num_envs_per_worker=5, # We will be restoring a TF2 policy. diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py index 811f00e9034b..16327b9b267c 100644 --- a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -115,7 +115,7 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: .framework(args.framework) # Set up our own callbacks. .callbacks(TaskSettingCallback) - .rollouts( + .env_runners( # Force sub-envs to be ray.actor.ActorHandles, so we can step # through them in parallel. 
remote_worker_envs=True, diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index 25a0bbeb85ea..4cf507c411b6 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -124,7 +124,7 @@ def default_resource_request( PPOConfig() .environment("CartPole-v1") .framework(args.framework) - .rollouts( + .env_runners( # Force sub-envs to be ray.actor.ActorHandles, so we can step # through them in parallel. remote_worker_envs=True, diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 97ebf6a10d30..686bb08d7f9c 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -197,7 +197,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .environment("multi_agent_cartpole") .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .rollouts(num_rollout_workers=0, rollout_fragment_length=50) + .env_runners(num_rollout_workers=0, rollout_fragment_length=50) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .reporting(metrics_num_episodes_for_smoothing=30) diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 6270b0eaa958..9c8fe8575854 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -63,7 +63,7 @@ config.training(num_sgd_iter=5, vf_loss_coeff=0.0001, train_batch_size=512) config.model["vf_share_layers"] = True elif args.run == "IMPALA": - config.rollouts(num_rollout_workers=2) + config.env_runners(num_env_runners=2) config.resources(num_gpus=0) config.training(vf_loss_coeff=0.01) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 6e092c047b61..4f94994d8586 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -50,7 +50,7 @@ def _get_encoder_config( catalog_class=MobileNetEnhancedPPOCatalog ) ) - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! 
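Several of the `_old_api_stack` examples above keep `remote_worker_envs=True`, which turns every sub-environment into its own Ray actor so the sub-envs can be stepped in parallel. Under the renamed API, that block would look roughly like the following sketch (env, counts, and the wait time are placeholders):

```python
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(
        num_env_runners=1,
        num_envs_per_env_runner=4,
        # Each sub-env becomes a ray.actor.ActorHandle that is stepped
        # remotely, in parallel to the other sub-envs of the same EnvRunner.
        remote_worker_envs=True,
        # Wait at most this many milliseconds when polling the remote sub-envs.
        remote_env_batch_wait_ms=10,
    )
)
```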
diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 35a1efe051e2..8617e243e620 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -268,7 +268,7 @@ def get_default_policy_class(cls, config): PPOConfig() .environment(TwoStepGame) .framework(args.framework) - .rollouts(batch_mode="complete_episodes", num_rollout_workers=0) + .env_runners(batch_mode="complete_episodes", num_env_runners=0) .training(model={"custom_model": "cc_model"}) .multi_agent( policies={ diff --git a/rllib/examples/centralized_critic_2.py b/rllib/examples/centralized_critic_2.py index eb5749132da1..75e5f5e20cb3 100644 --- a/rllib/examples/centralized_critic_2.py +++ b/rllib/examples/centralized_critic_2.py @@ -127,9 +127,9 @@ def central_critic_observer(agent_obs, **kw): PPOConfig() .environment(TwoStepGame) .framework(args.framework) - .rollouts( + .env_runners( batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, # TODO(avnishn) make a new example compatible w connectors. enable_connectors=False, ) diff --git a/rllib/examples/checkpoints/onnx_tf.py b/rllib/examples/checkpoints/onnx_tf.py index 3bcd8426c93d..0093afd0fd9e 100644 --- a/rllib/examples/checkpoints/onnx_tf.py +++ b/rllib/examples/checkpoints/onnx_tf.py @@ -26,7 +26,7 @@ ppo.PPOConfig() # ONNX is not supported by RLModule API yet. .experimental(_enable_new_api_stack=False) - .rollouts(num_rollout_workers=1) + .env_runners(num_rollout_workers=1) .framework(args.framework) ) diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index aefa77024e6a..008be01378a7 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -15,7 +15,7 @@ ppo.PPOConfig() # ONNX is not supported by RLModule API yet. .experimental(_enable_new_api_stack=False) - .rollouts(num_rollout_workers=1) + .env_runners(num_rollout_workers=1) .framework("torch") ) diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 10b1506c13fa..9c316e591d3a 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -111,14 +111,14 @@ def _env_creator(cfg): }, clip_rewards=True, ) - .rollouts( + .env_runners( # ... new EnvRunner and our frame stacking env-to-module connector. env_to_module_connector=( None if args.use_gym_wrapper_framestacking else _make_env_to_module_connector ), - num_envs_per_worker=1 if args.num_agents > 0 else 2, + num_envs_per_env_runner=1 if args.num_agents > 0 else 2, ) .training( # Use our frame stacking learner connector. diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 09f8d8d712a5..af5172117b09 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -30,11 +30,11 @@ get_trainable_cls(args.algo) .get_default_config() .environment("env" if args.num_agents > 0 else "Pendulum-v1") - .rollouts( + .env_runners( # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's # MultiAgentEnv API. - num_envs_per_worker=1 if args.num_agents > 0 else 20, + num_envs_per_env_runner=1 if args.num_agents > 0 else 20, # Define a single connector piece to be prepended to the env-to-module # connector pipeline. 
# Alternatively, return a list of n ConnectorV2 pieces (which will then be @@ -55,7 +55,7 @@ vf_loss_coeff=0.01, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_interval=1, evaluation_duration=10, diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/connectors/nested_action_spaces.py index c3c42fb6c1f5..830b87fb25fb 100644 --- a/rllib/examples/connectors/nested_action_spaces.py +++ b/rllib/examples/connectors/nested_action_spaces.py @@ -65,7 +65,7 @@ def _env_to_module_pipeline(env): "episode_len": 100, }, ) - .rollouts(env_to_module_connector=_env_to_module_pipeline) + .env_runners(env_to_module_connector=_env_to_module_pipeline) # No history in Env (bandit problem). .training( gamma=0.0, diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/nested_observation_spaces.py index ae7b62b25082..39a4bac1c585 100644 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ b/rllib/examples/connectors/nested_observation_spaces.py @@ -48,7 +48,7 @@ def _env_to_module_pipeline(env): get_trainable_cls(args.algo) .get_default_config() .environment("env") - .rollouts(env_to_module_connector=_env_to_module_pipeline) + .env_runners(env_to_module_connector=_env_to_module_pipeline) .training( gamma=0.99, lr=0.0003, diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index ee73e68f2e24..0c3a2693cca2 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -57,7 +57,7 @@ def _env_to_module(env): config = ( PPOConfig() .environment("env") - .rollouts(env_to_module_connector=_env_to_module) + .env_runners(env_to_module_connector=_env_to_module) .training( num_sgd_iter=6, lr=0.0003, diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 20d92a64f9c2..4b78003a73d2 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -212,7 +212,7 @@ def on_train_result( lr=0.0002, model={"vf_share_layers": True}, ) - .rollouts( + .env_runners( num_envs_per_worker=5, env_to_module_connector=lambda env: [ AddObservationsFromEpisodesToBatch(), diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/custom_metrics_and_callbacks.py index 3c5228433b7c..15e660273927 100644 --- a/rllib/examples/custom_metrics_and_callbacks.py +++ b/rllib/examples/custom_metrics_and_callbacks.py @@ -189,7 +189,7 @@ def on_postprocess_trajectory( .framework(args.framework) .callbacks(MyCallbacks) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) - .rollouts(enable_connectors=False) + .env_runners(enable_connectors=False) .reporting(keep_per_episode_custom_metrics=True) ) diff --git a/rllib/examples/custom_model_loss_and_metrics.py b/rllib/examples/custom_model_loss_and_metrics.py index ff8b890173b7..a5bf70ed2c05 100644 --- a/rllib/examples/custom_model_loss_and_metrics.py +++ b/rllib/examples/custom_model_loss_and_metrics.py @@ -74,7 +74,7 @@ .get_default_config() .environment("CartPole-v1") .framework(args.framework) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( model={ "custom_model": "custom_loss", diff --git a/rllib/examples/custom_recurrent_rnn_tokenizer.py b/rllib/examples/custom_recurrent_rnn_tokenizer.py index e662e71082b2..fea2f589085f 100644 --- 
a/rllib/examples/custom_recurrent_rnn_tokenizer.py +++ b/rllib/examples/custom_recurrent_rnn_tokenizer.py @@ -165,7 +165,7 @@ def get_tokenizer_config( PPOConfig() .environment(args.env, env_config={"repeat_delay": 2}) .framework(args.framework) - .rollouts(num_rollout_workers=0, num_envs_per_worker=20) + .env_runners(num_env_runners=0, num_envs_per_env_runner=20) .training( model={ "vf_share_layers": False, diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index a8ff486aecb6..8b819941c98b 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -39,7 +39,7 @@ env_config={"param_server": "param-server"}, ) .framework(args.framework) - .rollouts( + .env_runners( num_rollout_workers=1, num_envs_per_worker=2, rollout_fragment_length=50, diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py index 779d8935886f..96b30dbcff2b 100644 --- a/rllib/examples/envs/env_rendering_and_recording.py +++ b/rllib/examples/envs/env_rendering_and_recording.py @@ -101,7 +101,7 @@ def render(self, mode="rgb"): ) .framework(args.framework) # Use a vectorized env with 2 sub-envs. - .rollouts(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_worker=2, num_rollout_workers=1) .evaluation( # Evaluate once per training iteration. evaluation_interval=1, @@ -110,7 +110,7 @@ def render(self, mode="rgb"): # ... using one evaluation worker (setting this to 0 will cause # evaluation to run on the local evaluation worker, blocking # training until evaluation is done). - evaluation_num_workers=1, + evaluation_num_env_runners=1, # Special evaluation config. Keys specified here will override # the same keys in the main config, but only for evaluation. evaluation_config=PPOConfig.overrides( diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py index 8fe64b4576ef..764df7e3721c 100755 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ b/rllib/examples/envs/external_envs/cartpole_server.py @@ -173,7 +173,7 @@ def _input(ioctx): # Use the `PolicyServerInput` to generate experiences. .offline_data(input_=_input) # Use n worker processes to listen on different ports. - .rollouts( + .env_runners( num_rollout_workers=args.num_workers, # Connectors are not compatible with the external env. enable_connectors=False, diff --git a/rllib/examples/envs/external_envs/unity3d_server.py b/rllib/examples/envs/external_envs/unity3d_server.py index 10c642627f3c..00129aea074b 100755 --- a/rllib/examples/envs/external_envs/unity3d_server.py +++ b/rllib/examples/envs/external_envs/unity3d_server.py @@ -132,7 +132,7 @@ def _input(ioctx): # DL framework to use. .framework(args.framework) # Use n worker processes to listen on different ports. 
- .rollouts( + .env_runners( num_rollout_workers=args.num_workers, rollout_fragment_length=20, enable_connectors=False, diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 495e0ac684dd..4ffe4797270f 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -82,7 +82,7 @@ def env_creator(config): config = ( PPOConfig() .environment("pistonball", env_config={"local_ratio": 0.5}, clip_rewards=True) - .rollouts( + .env_runners( num_rollout_workers=15 if not args.as_test else 2, num_envs_per_worker=1, observation_filter="NoFilter", diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 752e612e7c25..58aee8db0cf4 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -131,7 +131,7 @@ .framework("tf" if args.env != "Pyramids" else "torch") # For running in editor, force to use just one Worker (we only have # one Unity running)! - .rollouts( + .env_runners( num_rollout_workers=args.num_workers if args.file_name else 0, rollout_fragment_length=200, ) @@ -153,7 +153,7 @@ # Switch on Curiosity based exploration for Pyramids env # (not solvable otherwise). if args.env == "Pyramids": - config.exploration( + config.env_runners( exploration_config={ "type": "Curiosity", "eta": 0.1, diff --git a/rllib/examples/evaluation/custom_evaluation.py b/rllib/examples/evaluation/custom_evaluation.py index 89640112a76d..b3101d1a27f9 100644 --- a/rllib/examples/evaluation/custom_evaluation.py +++ b/rllib/examples/evaluation/custom_evaluation.py @@ -157,7 +157,7 @@ def custom_eval_function(algorithm: Algorithm, eval_workers: WorkerSet) -> Resul None if args.no_custom_eval else custom_eval_function ), # Number of eval EnvRunners to use. - evaluation_num_workers=2, + evaluation_num_env_runners=2, # Enable evaluation, once per training iteration. evaluation_interval=1, # Run 10 episodes each time evaluation runs (OR "auto" if parallel to diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index a3849f25938d..150c254a4cb1 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -120,7 +120,7 @@ class AssertEvalCallback(DefaultCallbacks): def on_train_result(self, *, algorithm, result, **kwargs): # Make sure we always run exactly the given evaluation duration, # no matter what the other settings are (such as - # `evaluation_num_workers` or `evaluation_parallel_to_training`). + # `evaluation_num_env_runners` or `evaluation_parallel_to_training`). if ( "evaluation" in result and "hist_stats" in result["evaluation"]["sampler_results"] @@ -139,7 +139,7 @@ def on_train_result(self, *, algorithm, result, **kwargs): # fetch. assert ( num_timesteps_reported == 0 - or num_timesteps_reported >= algorithm.config.evaluation_num_workers + or num_timesteps_reported >= algorithm.config.evaluation_num_env_runners ) # We count in episodes. elif algorithm.config.evaluation_duration_unit == "episodes": @@ -202,7 +202,7 @@ def on_train_result(self, *, algorithm, result, **kwargs): ), # Use two evaluation workers. Must be >0, otherwise, # evaluation will run on a local worker and block (no parallelism). 
- evaluation_num_workers=args.evaluation_num_workers, + evaluation_num_env_runners=args.evaluation_num_env_runners, # Evaluate every other training iteration (together # with every other call to Algorithm.train()). evaluation_interval=args.evaluation_interval, diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py index bffb30b54f41..4965bba3c4c8 100644 --- a/rllib/examples/gpus/fractional_gpus.py +++ b/rllib/examples/gpus/fractional_gpus.py @@ -96,7 +96,7 @@ ) # How many RolloutWorkers (each with n environment copies: # `num_envs_per_worker`)? - .rollouts( + .env_runners( num_rollout_workers=args.num_workers, # This setting should not really matter as it does not affect the # number of GPUs reserved for each worker. diff --git a/rllib/examples/hierarchical/hierarchical_training.py b/rllib/examples/hierarchical/hierarchical_training.py index cb75f4adfddb..8b34a1f11cef 100644 --- a/rllib/examples/hierarchical/hierarchical_training.py +++ b/rllib/examples/hierarchical/hierarchical_training.py @@ -86,7 +86,7 @@ param_space=( PPOConfig() .environment(WindyMazeEnv) - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) .framework(args.framework) ).to_dict(), ).fit() @@ -103,7 +103,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): PPOConfig() .environment(HierarchicalWindyMazeEnv) .framework(args.framework) - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) .training(entropy_coeff=0.01) .multi_agent( policies={ diff --git a/rllib/examples/multi_agent/multi_agent_cartpole.py b/rllib/examples/multi_agent/multi_agent_cartpole.py index e19d70f7b683..b26f9b6ecb1b 100644 --- a/rllib/examples/multi_agent/multi_agent_cartpole.py +++ b/rllib/examples/multi_agent/multi_agent_cartpole.py @@ -47,7 +47,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("env" if args.num_agents > 0 else "CartPole-v1") - .rollouts( + .env_runners( # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's # MultiAgentEnv API. 
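The evaluation-related hunks above rename `evaluation_num_workers` to `evaluation_num_env_runners`. A minimal sketch of an evaluation block in the new spelling (values are illustrative; `evaluation_duration="auto"` requires evaluation to run parallel to training, as in the examples above):

```python
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(num_env_runners=2)
    .evaluation(
        evaluation_interval=1,                 # evaluate on every train() call
        evaluation_num_env_runners=1,          # was: evaluation_num_workers
        evaluation_parallel_to_training=True,  # sample eval episodes while training
        evaluation_duration="auto",            # fit evaluation into one training iteration
    )
)
```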
diff --git a/rllib/examples/multi_agent/multi_agent_pendulum.py b/rllib/examples/multi_agent/multi_agent_pendulum.py index 31d566b542cd..00e73bafd3c5 100644 --- a/rllib/examples/multi_agent/multi_agent_pendulum.py +++ b/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -47,7 +47,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("env" if args.num_agents > 0 else "Pendulum-v1") - .rollouts(num_rollout_workers=4) + .env_runners(num_rollout_workers=4) .training( train_batch_size_per_learner=512, mini_batch_size_per_learner=64, diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index 0548a428e748..0da454bfcb10 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -82,7 +82,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("RockPaperScissors") - .rollouts( + .env_runners( env_to_module_connector=lambda env: ( AddObservationsFromEpisodesToBatch(), # Only flatten obs for the learning RLModul diff --git a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py index 7dae9d516b3c..507c018babc8 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -61,7 +61,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("RockPaperScissors") - .rollouts( + .env_runners( env_to_module_connector=lambda env: ( AddObservationsFromEpisodesToBatch(), FlattenObservations(multi_agent=True), diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index 8ec58637d7bb..8888cb33e708 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -174,7 +174,7 @@ def _get_multi_agent(): win_rate_threshold=args.win_rate_threshold, ) ) - .rollouts( + .env_runners( num_rollout_workers=args.num_env_runners, num_envs_per_worker=1 if args.enable_new_api_stack else 5, # Set up the correct env-runner to use depending on diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index 225b0d0afa75..5ad574f19980 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -122,7 +122,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): win_rate_threshold=args.win_rate_threshold, ) ) - .rollouts( + .env_runners( num_rollout_workers=args.num_env_runners, num_envs_per_worker=1 if args.enable_new_api_stack else 5, # Set up the correct env-runner to use depending on diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index 21bedf482363..43e75b4b414f 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -87,7 +87,7 @@ def select_policy(algorithm, framework): .framework(args.framework) # disable filters, otherwise we would need to synchronize those # as well to the DQN agent - .rollouts(observation_filter="MeanStdFilter") + .env_runners(observation_filter="MeanStdFilter") .training( model={"vf_share_layers": True}, vf_loss_coeff=0.01, @@ 
-103,7 +103,7 @@ def select_policy(algorithm, framework): .framework(args.framework) # disable filters, otherwise we would need to synchronize those # as well to the DQN agent - .rollouts(observation_filter="MeanStdFilter") + .env_runners(observation_filter="MeanStdFilter") .training( model={"vf_share_layers": True}, n_step=3, diff --git a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py index 8723285cdc8a..afabd3fe9003 100644 --- a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py +++ b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -75,7 +75,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("grouped_twostep") - .rollouts( + .env_runners( env_to_module_connector=lambda env: ( AddObservationsFromEpisodesToBatch(), FlattenObservations(multi_agent=True), diff --git a/rllib/examples/offline_rl/custom_input_api.py b/rllib/examples/offline_rl/custom_input_api.py index c4e756dc09fd..4b96951cc5fb 100644 --- a/rllib/examples/offline_rl/custom_input_api.py +++ b/rllib/examples/offline_rl/custom_input_api.py @@ -99,7 +99,7 @@ def input_creator(ioctx: IOContext) -> InputReader: .training(train_batch_size=2000) .evaluation( evaluation_interval=1, - evaluation_num_workers=2, + evaluation_num_env_runners=2, evaluation_duration=10, evaluation_parallel_to_training=True, evaluation_config=default_config.overrides( diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index de791921a119..4d4f0803cf45 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -53,7 +53,7 @@ config = ( cql.CQLConfig() .framework(framework="torch") - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) .training( n_step=3, bc_iters=0, @@ -87,7 +87,7 @@ } ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_duration=10, evaluation_parallel_to_training=False, diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 35464f66667b..6293d8f43a9a 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -94,7 +94,7 @@ def my_experiment(config: Dict): # Set the number of EnvRunners for collecting training data to 0 (local # worker only). - config.rollouts(num_rollout_workers=0) + config.env_runners(num_rollout_workers=0) eval_algo = config.build() # Load state from the low-lr algo into this one. @@ -157,7 +157,7 @@ def my_experiment(config: Dict): PPOConfig() .experimental(_enable_new_api_stack=True) .environment("CartPole-v1") - .rollouts( + .env_runners( num_rollout_workers=0, env_runner_cls=SingleAgentEnvRunner, ) diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index f2988bcbd6c4..bbd3869e4477 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -84,7 +84,7 @@ def flush(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=SingleAgentEnvRunner) + .env_runners(env_runner_cls=SingleAgentEnvRunner) .environment("CartPole-v1") # Setting up a custom logger config. 
# ---------------------------------- diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index 01244f8deb71..8a487fc9a436 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -90,7 +90,7 @@ config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(env_runner_cls=MultiAgentEnvRunner) + .env_runners(env_runner_cls=MultiAgentEnvRunner) .environment("env") .multi_agent( # Define 3 policies. Note that in our simple setup, they are all configured diff --git a/rllib/examples/replay_buffer_api.py b/rllib/examples/replay_buffer_api.py index f0631135e17b..aec40862223c 100644 --- a/rllib/examples/replay_buffer_api.py +++ b/rllib/examples/replay_buffer_api.py @@ -57,7 +57,7 @@ DQNConfig() .environment("CartPole-v1") .framework(framework=args.framework) - .rollouts(num_rollout_workers=4) + .env_runners(num_env_runners=4) .training( model=dict(use_lstm=True, lstm_cell_size=64, max_seq_len=20), replay_buffer_config=replay_buffer_config, diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index 5bbc75f4c4e2..b3827fd010bc 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -73,7 +73,7 @@ def setup(self): ), }, ) - .rollouts(num_rollout_workers=0) + .env_runners(num_rollout_workers=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! diff --git a/rllib/execution/rollout_ops.py b/rllib/execution/rollout_ops.py index e88321e24713..413f7845c86c 100644 --- a/rllib/execution/rollout_ops.py +++ b/rllib/execution/rollout_ops.py @@ -114,7 +114,7 @@ def synchronous_parallel_sample( "No samples returned from remote workers. If you have a " "slow environment or model, consider increasing the " "`sample_timeout_s` or decreasing the " - "`rollout_fragment_length` in `AlgorithmConfig.rollouts()." + "`rollout_fragment_length` in `AlgorithmConfig.env_runners()." 
) elif worker_set.num_healthy_remote_workers() <= 0: logger.warning( diff --git a/rllib/models/tests/test_attention_nets.py b/rllib/models/tests/test_attention_nets.py index 14329d6a85dd..c57d36ff4364 100644 --- a/rllib/models/tests/test_attention_nets.py +++ b/rllib/models/tests/test_attention_nets.py @@ -16,7 +16,7 @@ class TestAttentionNets(unittest.TestCase): config = { "env": StatelessCartPole, "gamma": 0.99, - "num_envs_per_worker": 20, + "num_envs_per_env_runner": 20, "framework": "tf", } diff --git a/rllib/models/tests/test_lstms.py b/rllib/models/tests/test_lstms.py index 38ddac08e7d9..b336adcb6eae 100644 --- a/rllib/models/tests/test_lstms.py +++ b/rllib/models/tests/test_lstms.py @@ -54,9 +54,9 @@ def test_lstm_w_prev_action_and_prev_reward(self): train_batch_size=200, sgd_minibatch_size=50, ) - .rollouts( + .env_runners( rollout_fragment_length=100, - num_rollout_workers=1, + num_env_runners=1, ) .experimental( _disable_action_flattening=True, diff --git a/rllib/models/tests/test_models.py b/rllib/models/tests/test_models.py index a2215bbb6ca2..86100e86b690 100644 --- a/rllib/models/tests/test_models.py +++ b/rllib/models/tests/test_models.py @@ -64,7 +64,7 @@ def test_modelv3(self): ppo.PPOConfig() .environment("CartPole-v1") .framework("tf") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( model={ "custom_model": RNNModel, diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index b03327a14a5f..cfe8cec79155 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -50,7 +50,7 @@ def test_rlms_and_preprocessing(self): }, ) # Run this very quickly locally. - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( train_batch_size=10, sgd_minibatch_size=1, @@ -91,7 +91,7 @@ def test_preprocessing_disabled_modelv2(self): }, ) # Speed things up a little. - .rollouts(rollout_fragment_length=5) + .env_runners(rollout_fragment_length=5) .training(train_batch_size=100, sgd_minibatch_size=10, num_sgd_iter=1) .debugging(seed=42) # Set this to True to enforce no preprocessors being used. 
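On the evaluation side, the tests below (for example test_ope.py) use the matching rename, evaluation_num_workers to evaluation_num_env_runners, while the other evaluation options keep their names. A condensed sketch of an evaluation block written against the new names; DQNConfig, the environment, and the counts are illustrative only and assume a version that accepts the renamed key.

    from ray.rllib.algorithms.dqn import DQNConfig

    config = (
        DQNConfig()
        .environment("CartPole-v1")
        .env_runners(num_env_runners=2)
        .evaluation(
            evaluation_interval=1,           # evaluate after every training iteration
            evaluation_num_env_runners=1,    # was: evaluation_num_workers
            evaluation_duration=10,
            evaluation_duration_unit="episodes",
        )
    )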
diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py index 062be5f9797e..3b28b18a6b2b 100644 --- a/rllib/offline/estimators/tests/test_ope.py +++ b/rllib/offline/estimators/tests/test_ope.py @@ -124,7 +124,7 @@ def setUpClass(cls): DQNConfig() .environment(env=env_name) .framework("torch") - .rollouts(batch_mode="complete_episodes") + .env_runners(batch_mode="complete_episodes") .offline_data( input_="dataset", input_config={"format": "json", "paths": train_data}, @@ -132,7 +132,7 @@ def setUpClass(cls): .evaluation( evaluation_interval=1, evaluation_duration=n_episodes, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_duration_unit="episodes", off_policy_estimation_methods={ "is": {"type": ImportanceSampling, "epsilon_greedy": 0.1}, @@ -144,8 +144,8 @@ def setUpClass(cls): .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", 0))) ) - num_rollout_workers = 4 - dsize = num_rollout_workers * 1024 + num_env_runners = 4 + dsize = num_env_runners * 1024 feature_dim = 64 action_dim = 8 @@ -158,7 +158,7 @@ def setUpClass(cls): cls.train_df = pd.DataFrame({k: list(v) for k, v in data.items()}) cls.train_df["type"] = "SampleBatch" - train_ds = ray.data.from_pandas(cls.train_df).repartition(num_rollout_workers) + train_ds = ray.data.from_pandas(cls.train_df).repartition(num_env_runners) cls.dqn_on_fake_ds = ( DQNConfig() @@ -166,15 +166,15 @@ def setUpClass(cls): observation_space=gym.spaces.Box(-1, 1, (feature_dim,)), action_space=gym.spaces.Discrete(action_dim), ) - .rollouts(num_rollout_workers=num_rollout_workers) + .env_runners(num_env_runners=num_env_runners) .framework("torch") - # .rollouts(num_rollout_workers=num_rollout_workers) + # .env_runners(num_env_runners=num_env_runners) .offline_data( input_="dataset", input_config={"loader_fn": lambda: train_ds}, ) .evaluation( - evaluation_num_workers=num_rollout_workers, + evaluation_num_env_runners=num_env_runners, ope_split_batch_by_episode=False, ) # make the policy deterministic diff --git a/rllib/offline/estimators/tests/utils.py b/rllib/offline/estimators/tests/utils.py index 2f28da1c7a25..bacd983892eb 100644 --- a/rllib/offline/estimators/tests/utils.py +++ b/rllib/offline/estimators/tests/utils.py @@ -44,7 +44,7 @@ def get_cliff_walking_wall_policy_and_data( config = ( AlgorithmConfig() .debugging(seed=seed) - .rollouts(batch_mode="complete_episodes") + .env_runners(batch_mode="complete_episodes") .experimental(_disable_preprocessor_api=True) ) config = config.to_dict() diff --git a/rllib/offline/tests/test_dataset_reader.py b/rllib/offline/tests/test_dataset_reader.py index 9557f68cb4f1..b8825c49a307 100644 --- a/rllib/offline/tests/test_dataset_reader.py +++ b/rllib/offline/tests/test_dataset_reader.py @@ -91,7 +91,7 @@ def test_dataset_shard_with_task_parallelization(self): "paths": self.dset_path, }, ) - .rollouts(num_rollout_workers=10) + .env_runners(num_env_runners=10) ) NUM_WORKERS = 4 diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 66743bb1ae8f..5277bc5c87b0 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -70,7 +70,7 @@ def do_test_log_likelihood( ): config = config.copy(copy_frozen=False) # Run locally. - config.num_rollout_workers = 0 + config.num_env_runners = 0 # Env setup. 
if continuous: config.env = "Pendulum-v1" @@ -159,7 +159,7 @@ def test_dqn(self): """Tests, whether DQN correctly computes logp in soft-q mode.""" config = dqn.DQNConfig() # Soft-Q for DQN. - config.exploration(exploration_config={"type": "SoftQ", "temperature": 0.5}) + config.env_runners(exploration_config={"type": "SoftQ", "temperature": 0.5}) config.debugging(seed=42) do_test_log_likelihood(dqn.DQN, config) diff --git a/rllib/policy/tests/test_policy_checkpoint_restore.py b/rllib/policy/tests/test_policy_checkpoint_restore.py index e7371bb428f9..dc9f22e3c3ba 100644 --- a/rllib/policy/tests/test_policy_checkpoint_restore.py +++ b/rllib/policy/tests/test_policy_checkpoint_restore.py @@ -16,7 +16,7 @@ def _do_checkpoint_twice_test(framework): # Checks if we can load a policy from a checkpoint (at least) twice config = ( - PPOConfig().rollouts(num_rollout_workers=0).evaluation(evaluation_num_workers=0) + PPOConfig().env_runners(num_env_runners=0).evaluation(evaluation_num_env_runners=0) ) for fw in framework_iterator(config, frameworks=[framework]): algo1 = config.build(env="CartPole-v1") @@ -60,7 +60,7 @@ def test_policy_from_checkpoint_twice_torch(self): def test_add_policy_connector_enabled(self): with tempfile.TemporaryDirectory() as tmpdir: config = ( - APPOConfig().environment("CartPole-v1").rollouts(enable_connectors=True) + APPOConfig().environment("CartPole-v1").env_runners(enable_connectors=True) ) algo = config.build() algo.train() @@ -109,10 +109,10 @@ def test_restore_checkpoint_with_nested_obs_space(self): .environment( observation_space=obs_space, action_space=gym.spaces.Discrete(2) ) - # Note (Artur): We have to choose num_rollout_workers=0 here, because + # Note (Artur): We have to choose num_env_runners=0 here, because # otherwise RolloutWorker will be health-checked without an env which # raises an error. You could also disable the health-check here. - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .build() .get_policy() ) diff --git a/rllib/tests/backward_compat/test_backward_compat.py b/rllib/tests/backward_compat/test_backward_compat.py index 80114764bfeb..cd5230598cc6 100644 --- a/rllib/tests/backward_compat/test_backward_compat.py +++ b/rllib/tests/backward_compat/test_backward_compat.py @@ -117,7 +117,7 @@ def test_old_algorithm_config_dicts(self): "num_envs_per_worker": 4, "explore": False, }, - "evaluation_num_workers": 1, + "evaluation_num_env_runners": 1, "multiagent": { "policies": { "policy1": PolicySpec(), @@ -128,7 +128,7 @@ def test_old_algorithm_config_dicts(self): } algo = DQN(config=config) self.assertTrue(algo.config.lr == 0.001) - self.assertTrue(algo.config.evaluation_num_workers == 1) + self.assertTrue(algo.config.evaluation_num_env_runners == 1) self.assertTrue(list(algo.config.policies.keys()) == ["policy1"]) self.assertTrue(algo.config.explore is True) self.assertTrue(algo.evaluation_config.explore is False) diff --git a/rllib/tests/test_algorithm_checkpoint_restore.py b/rllib/tests/test_algorithm_checkpoint_restore.py index d6017e8e9975..2715a2f0129b 100644 --- a/rllib/tests/test_algorithm_checkpoint_restore.py +++ b/rllib/tests/test_algorithm_checkpoint_restore.py @@ -19,7 +19,7 @@ algorithms_and_configs = { "DQN": ( DQNConfig() - .exploration(explore=False) + .env_runners(explore=False) .training(num_steps_sampled_before_learning_starts=0) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), @@ -28,10 +28,10 @@ # explore is set to None for PPO in favor of RLModule API support. 
PPOConfig() .training(num_sgd_iter=5, train_batch_size=1000) - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_config=PPOConfig.overrides( # Define a (slightly different mapping function to test, whether eval @@ -47,7 +47,7 @@ ), "SAC": ( SACConfig() - .exploration(explore=False) + .env_runners(explore=False) .training(num_steps_sampled_before_learning_starts=0) .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), diff --git a/rllib/tests/test_algorithm_rl_module_restore.py b/rllib/tests/test_algorithm_rl_module_restore.py index 982beb0f41a4..0f1c2f616210 100644 --- a/rllib/tests/test_algorithm_rl_module_restore.py +++ b/rllib/tests/test_algorithm_rl_module_restore.py @@ -51,7 +51,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(rollout_fragment_length=4) + .env_runners(rollout_fragment_length=4) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) @@ -189,7 +189,7 @@ def test_e2e_load_rl_module(self): config = ( PPOConfig() .experimental(_enable_new_api_stack=True) - .rollouts(rollout_fragment_length=4) + .env_runners(rollout_fragment_length=4) .environment("CartPole-v1") .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) .resources(**scaling_config) diff --git a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py index b9a430242a5e..02467b60858d 100644 --- a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py @@ -33,7 +33,7 @@ def save_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): """ algo_cfg = ( algo_cfg.experimental(_enable_new_api_stack=True) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at # least once @@ -69,7 +69,7 @@ def load_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): """ algo_cfg = ( algo_cfg.experimental(_enable_new_api_stack=True) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at # least once diff --git a/rllib/tests/test_custom_resource.py b/rllib/tests/test_custom_resource.py index fe87e19bffe6..27cd495fa46c 100644 --- a/rllib/tests/test_custom_resource.py +++ b/rllib/tests/test_custom_resource.py @@ -21,7 +21,7 @@ def test_custom_resource(algorithm): .get_default_config() .environment("CartPole-v1") .framework("torch") - .rollouts(num_rollout_workers=1) + .env_runners(num_env_runners=1) .resources(num_gpus=0, custom_resources_per_worker={"custom_resource": 0.01}) ) stop = {"training_iteration": 1} diff --git a/rllib/tests/test_dependency_tf.py b/rllib/tests/test_dependency_tf.py index 87eb513f7170..5e51f5178791 100644 --- a/rllib/tests/test_dependency_tf.py +++ b/rllib/tests/test_dependency_tf.py @@ -24,7 +24,7 @@ PPOConfig() .environment("CartPole-v1") .framework("torch") - .rollouts(num_rollout_workers=0) + 
.env_runners(num_env_runners=0) ) # Note: No ray.init(), to test it works without Ray algo = config.build() diff --git a/rllib/tests/test_dependency_torch.py b/rllib/tests/test_dependency_torch.py index df5239a8e85c..7048d0f92cf2 100755 --- a/rllib/tests/test_dependency_torch.py +++ b/rllib/tests/test_dependency_torch.py @@ -23,7 +23,7 @@ PPOConfig() .environment("CartPole-v1") .framework("tf") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) # Disable the logger due to a sort-import attempt of torch # inside the tensorboardX.SummaryWriter class. .debugging(logger_config={"type": "ray.tune.logger.NoopLogger"}) diff --git a/rllib/tests/test_gpus.py b/rllib/tests/test_gpus.py index aca735ae90c4..3fb0aa1059a5 100644 --- a/rllib/tests/test_gpus.py +++ b/rllib/tests/test_gpus.py @@ -18,7 +18,7 @@ def test_gpus_in_non_local_mode(self): actual_gpus = torch.cuda.device_count() print(f"Actual GPUs found (by torch): {actual_gpus}") - config = PPOConfig().rollouts(num_rollout_workers=2).environment("CartPole-v1") + config = PPOConfig().env_runners(num_env_runners=2).environment("CartPole-v1") # Expect errors when we run a config w/ num_gpus>0 w/o a GPU # and _fake_gpus=False. @@ -88,7 +88,7 @@ def test_gpus_in_local_mode(self): actual_gpus_available = torch.cuda.device_count() - config = PPOConfig().rollouts(num_rollout_workers=2).environment("CartPole-v1") + config = PPOConfig().env_runners(num_env_runners=2).environment("CartPole-v1") # Expect no errors in local mode. for num_gpus in [0, 0.1, 1, actual_gpus_available + 4]: diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index a5ef2e6a4d28..cdbc804641e4 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -260,7 +260,7 @@ def test_multiple_output_workers(self): config = ( PPOConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .training(train_batch_size=500) .evaluation(off_policy_estimation_methods={}) ) diff --git a/rllib/tests/test_local.py b/rllib/tests/test_local.py index bfa5dc5141ad..7664a8158cff 100644 --- a/rllib/tests/test_local.py +++ b/rllib/tests/test_local.py @@ -16,7 +16,7 @@ def test_local(self): config = ( PPOConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .training(model={"fcnet_hiddens": [10]}) ) diff --git a/rllib/tests/test_lstm.py b/rllib/tests/test_lstm.py index 70f5bf87e25b..245d3db9b055 100644 --- a/rllib/tests/test_lstm.py +++ b/rllib/tests/test_lstm.py @@ -180,7 +180,7 @@ def test_simple_optimizer_sequencing(self): PPOConfig() .environment("counter") .framework("tf") - .rollouts(num_rollout_workers=0, rollout_fragment_length=10) + .env_runners(num_env_runners=0, rollout_fragment_length=10) .training( train_batch_size=10, sgd_minibatch_size=10, @@ -251,7 +251,7 @@ def test_minibatch_sequencing(self): PPOConfig() .environment("counter") .framework("tf") - .rollouts(num_rollout_workers=0, rollout_fragment_length=20) + .env_runners(num_env_runners=0, rollout_fragment_length=20) .training( train_batch_size=20, sgd_minibatch_size=10, diff --git a/rllib/tests/test_model_imports.py b/rllib/tests/test_model_imports.py index 9fd761d0a657..fefbb9cb9d07 100644 --- a/rllib/tests/test_model_imports.py +++ b/rllib/tests/test_model_imports.py @@ -201,7 +201,7 @@ def test_ppo(self): config=( PPOConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) # We need to diable the RLModule / Learner API here, since this test is # overfitted 
to the ModelV2 API stack. .training( diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index 0c6439301aff..c2efe8adc818 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -410,7 +410,7 @@ def do_test_nested_dict(self, make_env, test_lstm=False): PPOConfig() .experimental(_disable_preprocessor_api=True) .environment("nested") - .rollouts(num_rollout_workers=0, rollout_fragment_length=5) + .env_runners(num_env_runners=0, rollout_fragment_length=5) .framework("tf") .training( model={"custom_model": "composite", "use_lstm": test_lstm}, @@ -444,7 +444,7 @@ def do_test_nested_tuple(self, make_env): PPOConfig() .experimental(_disable_preprocessor_api=True) .environment("nested2") - .rollouts(num_rollout_workers=0, rollout_fragment_length=5) + .env_runners(num_env_runners=0, rollout_fragment_length=5) .framework("tf") .training( model={"custom_model": "composite2"}, @@ -505,7 +505,7 @@ def test_torch_model(self): PPOConfig() .environment("nested") .framework("torch") - .rollouts(num_rollout_workers=0, rollout_fragment_length=5) + .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, sgd_minibatch_size=5, @@ -547,7 +547,7 @@ def test_torch_repeated(self): PPOConfig() .environment("repeat") .framework("torch") - .rollouts(num_rollout_workers=0, rollout_fragment_length=5) + .env_runners(num_env_runners=0, rollout_fragment_length=5) .training( train_batch_size=5, num_sgd_iter=1, diff --git a/rllib/tests/test_node_failure.py b/rllib/tests/test_node_failure.py index 7bfd5af40890..0ff3a7fde048 100644 --- a/rllib/tests/test_node_failure.py +++ b/rllib/tests/test_node_failure.py @@ -51,10 +51,10 @@ def test_continue_training_on_failure(self): config = ( PPOConfig() .environment("CartPole-v1") - .rollouts( - num_rollout_workers=6, + .env_runners( + num_env_runners=6, recreate_failed_workers=True, - validate_workers_after_construction=True, + validate_env_runners_after_construction=True, ) .training( train_batch_size=300, diff --git a/rllib/tests/test_pettingzoo_env.py b/rllib/tests/test_pettingzoo_env.py index da29aa2a021e..e77d42a89811 100644 --- a/rllib/tests/test_pettingzoo_env.py +++ b/rllib/tests/test_pettingzoo_env.py @@ -63,8 +63,8 @@ def env_creator(config): policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "av", ) .debugging(log_level="DEBUG") - .rollouts( - num_rollout_workers=1, + .env_runners( + num_env_runners=1, # Fragment length, collected at once from each worker # and for each agent! 
rollout_fragment_length=30, @@ -83,7 +83,7 @@ def test_pettingzoo_env(self): config = ( PPOConfig() .environment("simple_spread") - .rollouts(num_rollout_workers=0, rollout_fragment_length=30) + .env_runners(num_env_runners=0, rollout_fragment_length=30) .debugging(log_level="DEBUG") .training(train_batch_size=200) .multi_agent( diff --git a/rllib/tests/test_placement_groups.py b/rllib/tests/test_placement_groups.py index b7a8731e7f20..46fffcdcf867 100644 --- a/rllib/tests/test_placement_groups.py +++ b/rllib/tests/test_placement_groups.py @@ -39,7 +39,7 @@ def test_overriding_default_resource_request(self): model={"fcnet_hiddens": [10]}, lr=tune.grid_search([0.1, 0.01, 0.001]) ) .environment("CartPole-v1") - .rollouts(num_rollout_workers=2) + .env_runners(num_env_runners=2) .framework("tf") ) @@ -71,8 +71,8 @@ def default_resource_request(cls, config): def test_default_resource_request(self): config = ( PPOConfig() - .rollouts( - num_rollout_workers=2, + .env_runners( + num_env_runners=2, ) .training( model={"fcnet_hiddens": [10]}, lr=tune.grid_search([0.1, 0.01, 0.001]) @@ -99,7 +99,7 @@ def test_default_resource_request_plus_manual_leads_to_error(self): PPOConfig() .training(model={"fcnet_hiddens": [10]}) .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) ) try: diff --git a/rllib/tests/test_rllib_train_and_evaluate.py b/rllib/tests/test_rllib_train_and_evaluate.py index e6108b52961d..3bbe33a16a5a 100644 --- a/rllib/tests/test_rllib_train_and_evaluate.py +++ b/rllib/tests/test_rllib_train_and_evaluate.py @@ -183,7 +183,7 @@ def policy_fn(agent_id, episode, **kwargs): .get_default_config() .environment(MultiAgentCartPole) .framework(fw) - .rollouts(num_rollout_workers=1) + .env_runners(num_env_runners=1) .multi_agent( policies={"pol0", "pol1"}, policy_mapping_fn=policy_fn, diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index e342a55fb0b2..0736a3348917 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -67,7 +67,7 @@ def test_ppo_multiagent(self): "PPO", ( PPOConfig() - .rollouts(num_rollout_workers=1, rollout_fragment_length=10) + .env_runners(num_env_runners=1, rollout_fragment_length=10) .training(num_sgd_iter=1, train_batch_size=10, sgd_minibatch_size=1) ), ) @@ -98,7 +98,7 @@ def test_sac_multiagent(self): ( SACConfig() .environment(normalize_actions=False) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training(replay_buffer_config={"capacity": 1000}) ), ) diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 8451931231cf..12d4d8bf2df9 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -71,7 +71,7 @@ def tearDownClass(cls) -> None: def test_ppo(self): config = ( PPOConfig() - .rollouts(num_rollout_workers=2, rollout_fragment_length=50) + .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, num_sgd_iter=1, @@ -100,7 +100,7 @@ def test_ppo_no_preprocessors_gpu(self): # obscure errors. 
config = ( PPOConfig() - .rollouts(num_rollout_workers=2, rollout_fragment_length=50) + .env_runners(num_env_runners=2, rollout_fragment_length=50) .training( train_batch_size=100, num_sgd_iter=1, diff --git a/rllib/tests/test_timesteps.py b/rllib/tests/test_timesteps.py index 77f78c3a5145..9725bb22ebdb 100644 --- a/rllib/tests/test_timesteps.py +++ b/rllib/tests/test_timesteps.py @@ -22,7 +22,7 @@ def test_timesteps(self): ppo.PPOConfig() .experimental(_disable_preprocessor_api=True) .environment(RandomEnv) - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( model={ "fcnet_hiddens": [1], diff --git a/rllib/tuned_examples/appo/cartpole-appo-fake-gpus.yaml b/rllib/tuned_examples/appo/cartpole-appo-fake-gpus.yaml index 73581ac2b267..a8e3b47053ce 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-fake-gpus.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-fake-gpus.yaml @@ -7,7 +7,7 @@ cartpole-appo-vtrace-fake-gpus: config: # Works for both torch and tf. framework: torch - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 num_workers: 1 observation_filter: MeanStdFilter num_sgd_iter: 6 diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index a3637d215358..dcce4afc042b 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -26,8 +26,8 @@ "vf_share_layers": False, }, ) - .rollouts( - num_envs_per_worker=5, + .env_runners( + num_envs_per_env_runner=5, num_rollout_workers=1, observation_filter="MeanStdFilter", ) diff --git a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml index f706d696a550..a11ecb312fe4 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml @@ -10,7 +10,7 @@ cartpole-appo-w-rl-modules-and-learner: # Works for both torch and tf. framework: torch - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 num_workers: 1 num_gpus: 0 observation_filter: MeanStdFilter diff --git a/rllib/tuned_examples/appo/cartpole-appo.yaml b/rllib/tuned_examples/appo/cartpole-appo.yaml index b6785c0d3eb0..921bd7fcb546 100644 --- a/rllib/tuned_examples/appo/cartpole-appo.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo.yaml @@ -7,7 +7,7 @@ cartpole-appo: config: # Works for both torch and tf. framework: torch - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 num_workers: 4 num_gpus: 0 observation_filter: MeanStdFilter diff --git a/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py index 786f669376d6..6f26c307c2b7 100644 --- a/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -34,16 +34,16 @@ "stall_on_worker_indices": [2, 3], }, ) - .rollouts( - num_rollout_workers=1, - num_envs_per_worker=1, + .env_runners( + num_env_runners=1, + num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). 
.fault_tolerance( recreate_failed_workers=True, ) .evaluation( - evaluation_num_workers=4, + evaluation_num_env_runners=4, evaluation_interval=1, evaluation_duration=25, evaluation_duration_unit="episodes", diff --git a/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py index 3bb04eebf0a6..2b1c72c37e99 100644 --- a/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/cartpole-crashing-recreate-workers-appo.py @@ -29,16 +29,16 @@ "crash_on_worker_indices": [1, 2], }, ) - .rollouts( - num_rollout_workers=3, - num_envs_per_worker=1, + .env_runners( + num_env_runners=3, + num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). .fault_tolerance( recreate_failed_workers=True, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_duration=25, evaluation_duration_unit="episodes", diff --git a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml index 52587329d163..1ae6ef283c4a 100644 --- a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml +++ b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml @@ -25,7 +25,7 @@ frozenlake-appo-vtrace: batch_mode: complete_episodes vtrace: true - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 num_workers: 4 num_gpus: 0 num_sgd_iter: 1 diff --git a/rllib/tuned_examples/appo/halfcheetah-appo.yaml b/rllib/tuned_examples/appo/halfcheetah-appo.yaml index b423e0edf971..39a518a17dda 100644 --- a/rllib/tuned_examples/appo/halfcheetah-appo.yaml +++ b/rllib/tuned_examples/appo/halfcheetah-appo.yaml @@ -18,7 +18,7 @@ halfcheetah-appo: broadcast_interval: 1 max_sample_requests_in_flight_per_worker: 1 num_multi_gpu_tower_stacks: 1 - num_envs_per_worker: 32 + num_envs_per_env_runner: 32 minibatch_buffer_size: 16 num_sgd_iter: 32 clip_param: 0.2 diff --git a/rllib/tuned_examples/appo/memory-leak-test-appo.yaml b/rllib/tuned_examples/appo/memory-leak-test-appo.yaml index ca6f4008039c..466357153e64 100644 --- a/rllib/tuned_examples/appo/memory-leak-test-appo.yaml +++ b/rllib/tuned_examples/appo/memory-leak-test-appo.yaml @@ -10,5 +10,5 @@ memory-leak-test-appo: config: static_samples: true num_workers: 4 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 rollout_fragment_length: 20 diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py index 83c76d38a259..64fc0b5f1a2d 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -35,8 +35,8 @@ "stall_on_worker_indices": [2, 3], }, ) - .rollouts( - num_rollout_workers=3, + .env_runners( + num_env_runners=3, num_envs_per_worker=1, ) # Switch on resiliency (recreate any failed worker). 
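The crashing-CartPole tuned examples in this group pair the renamed .env_runners(...) block with worker resiliency via .fault_tolerance(recreate_failed_workers=True), so failed EnvRunners are recreated instead of aborting the run. A rough sketch of that combination; APPOConfig is taken from these files, while plain CartPole-v1 stands in for the intentionally crashing env variant they register.

    from ray.rllib.algorithms.appo import APPOConfig

    config = (
        APPOConfig()
        # The tuned examples register a CartPole variant that crashes on purpose;
        # plain CartPole-v1 is used here only to keep the sketch self-contained.
        .environment("CartPole-v1")
        .env_runners(num_env_runners=3, num_envs_per_env_runner=1)
        # Recreate any failed EnvRunner rather than failing the whole experiment.
        .fault_tolerance(recreate_failed_workers=True)
    )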
@@ -44,7 +44,7 @@ recreate_failed_workers=True, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_duration=25, evaluation_duration_unit="episodes", diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py index 312aae1aa83f..cd17153639e0 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py @@ -30,16 +30,16 @@ "p_crash_reset": 0.005, # prob to crash during reset() }, ) - .rollouts( + .env_runners( num_rollout_workers=4, - num_envs_per_worker=1, + num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). .fault_tolerance( recreate_failed_workers=True, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_duration=25, evaluation_duration_unit="episodes", diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py index 0d13d26241dc..2444090b91f8 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py @@ -12,15 +12,15 @@ # Number of those policies that should be trained. These are a subset of `num_policies`. num_trainable = 10 -num_envs_per_worker = 5 +num_envs_per_env_runner = 5 # Define the config as an APPOConfig object. config = ( APPOConfig() .environment("multi_cartpole") - .rollouts( - num_rollout_workers=4, - num_envs_per_worker=num_envs_per_worker, + .env_runners( + num_env_runners=4, + num_envs_per_env_runner=num_envs_per_env_runner, observation_filter="MeanStdFilter", ) .training( @@ -37,7 +37,7 @@ # 2 agents per sub-env. # This is to avoid excessive swapping during an episode rollout, since # Policies are only re-picked at the beginning of each episode. - policy_map_capacity=2 * num_envs_per_worker, + policy_map_capacity=2 * num_envs_per_env_runner, policy_states_are_swappable=True, policies={f"pol{i}" for i in range(num_policies)}, # Train only the first n policies. 
@@ -61,7 +61,7 @@ + str(0 if aid == 0 else np.random.randint(num_trainable, num_policies)) ), ), - evaluation_num_workers=2, + evaluation_num_env_runners=2, evaluation_interval=1, evaluation_parallel_to_training=True, ) diff --git a/rllib/tuned_examples/appo/multi_agent_cartpole_appo.py b/rllib/tuned_examples/appo/multi_agent_cartpole_appo.py index 69c1aca82309..240fee9eff09 100644 --- a/rllib/tuned_examples/appo/multi_agent_cartpole_appo.py +++ b/rllib/tuned_examples/appo/multi_agent_cartpole_appo.py @@ -8,9 +8,9 @@ config = ( APPOConfig() .environment("env", env_config={"num_agents": 4}) - .rollouts( - num_envs_per_worker=5, - num_rollout_workers=4, + .env_runners( + num_envs_per_env_runner=5, + num_env_runners=4, observation_filter="MeanStdFilter", ) .resources(num_gpus=1, _fake_gpus=True) diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml index 43978fb1fd3c..4d412735d1c8 100644 --- a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml @@ -24,7 +24,7 @@ appo-pongnoframeskip-v5: num_workers: 31 broadcast_interval: 1 max_sample_requests_in_flight_per_worker: 1 - num_envs_per_worker: 8 + num_envs_per_env_runner: 8 num_sgd_iter: 2 vf_loss_coeff: 1.0 clip_param: 0.3 diff --git a/rllib/tuned_examples/appo/pong-appo.yaml b/rllib/tuned_examples/appo/pong-appo.yaml index 8614de9c3cfc..e680f24d8809 100644 --- a/rllib/tuned_examples/appo/pong-appo.yaml +++ b/rllib/tuned_examples/appo/pong-appo.yaml @@ -25,7 +25,7 @@ pong-appo: broadcast_interval: 1 max_sample_requests_in_flight_per_worker: 1 num_multi_gpu_tower_stacks: 1 - num_envs_per_worker: 8 + num_envs_per_env_runner: 8 minibatch_buffer_size: 4 num_sgd_iter: 2 vf_loss_coeff: 1.0 diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index 313f5c2ea03f..f66f23649649 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -8,7 +8,7 @@ .experimental(_enable_new_api_stack=False) .environment(StatelessCartPole) .resources(num_gpus=0) - .rollouts(num_rollout_workers=1, observation_filter="MeanStdFilter") + .env_runners(num_env_runners=1, observation_filter="MeanStdFilter") .training( lr=0.0003, num_sgd_iter=6, diff --git a/rllib/tuned_examples/bc/cartpole-bc.yaml b/rllib/tuned_examples/bc/cartpole-bc.yaml index 103a0e71e4b1..b88505e4a380 100644 --- a/rllib/tuned_examples/bc/cartpole-bc.yaml +++ b/rllib/tuned_examples/bc/cartpole-bc.yaml @@ -12,7 +12,7 @@ cartpole-bc: framework: torch # In order to evaluate on an actual environment, use these following # settings: - evaluation_num_workers: 1 + evaluation_num_env_runners: 1 evaluation_interval: 1 evaluation_config: input: sampler diff --git a/rllib/tuned_examples/compact-regression-test.yaml b/rllib/tuned_examples/compact-regression-test.yaml index d4e9941e491b..862fbb0c60f3 100644 --- a/rllib/tuned_examples/compact-regression-test.yaml +++ b/rllib/tuned_examples/compact-regression-test.yaml @@ -17,7 +17,7 @@ atari-impala: rollout_fragment_length: 50 train_batch_size: 500 num_workers: 10 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 clip_rewards: True lr_schedule: [ [0, 0.0005], @@ -44,7 +44,7 @@ atari-ppo-tf: sgd_minibatch_size: 500 num_sgd_iter: 10 num_workers: 10 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 batch_mode: truncate_episodes observation_filter: NoFilter 
model: @@ -71,7 +71,7 @@ atari-ppo-torch: sgd_minibatch_size: 500 num_sgd_iter: 10 num_workers: 10 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 batch_mode: truncate_episodes observation_filter: NoFilter model: @@ -103,7 +103,7 @@ apex: capacity: 1000000 num_gpus: 1 num_workers: 8 - num_envs_per_worker: 8 + num_envs_per_env_runner: 8 rollout_fragment_length: 20 train_batch_size: 512 target_network_update_freq: 50000 @@ -120,7 +120,7 @@ atari-a2c: rollout_fragment_length: 20 clip_rewards: True num_workers: 5 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 num_gpus: 1 lr_schedule: [ [0, 0.0007], diff --git a/rllib/tuned_examples/cql/pendulum-cql.yaml b/rllib/tuned_examples/cql/pendulum-cql.yaml index 5ba42e4b7a8c..16f2ecd98db7 100644 --- a/rllib/tuned_examples/cql/pendulum-cql.yaml +++ b/rllib/tuned_examples/cql/pendulum-cql.yaml @@ -32,7 +32,7 @@ pendulum-cql: # Evaluate in an actual environment. evaluation_interval: 1 - evaluation_num_workers: 2 + evaluation_num_env_runners: 2 evaluation_duration: 10 evaluation_parallel_to_training: true evaluation_config: diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index 746216fdb1a1..c8e26ab4763f 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -298,7 +298,7 @@ def stop_all(self): ) # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, env_runner_cls=SingleAgentEnvRunner, @@ -346,7 +346,7 @@ def stop_all(self): .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ "explore": False, diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 961493102c81..8ee3937c13cc 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -291,7 +291,7 @@ def stop_all(self): ) # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( # Every 4 agent steps a training update is performed. 
rollout_fragment_length=4, env_runner_cls=SingleAgentEnvRunner, @@ -344,7 +344,7 @@ def stop_all(self): .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ "explore": False, diff --git a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py index 8d73723b59e5..bd2ec7602f07 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py @@ -6,7 +6,7 @@ .environment(env="CartPole-v1") .framework(framework="torch") .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, ) @@ -40,7 +40,7 @@ .evaluation( evaluation_interval=1, evaluation_parallel_to_training=True, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_duration="auto", evaluation_config={ "explore": False, diff --git a/rllib/tuned_examples/dreamerv3/atari_100k.py b/rllib/tuned_examples/dreamerv3/atari_100k.py index c6442cc73e06..5f5829f39a62 100644 --- a/rllib/tuned_examples/dreamerv3/atari_100k.py +++ b/rllib/tuned_examples/dreamerv3/atari_100k.py @@ -41,10 +41,10 @@ num_gpus_per_learner_worker=1 if num_gpus else 0, num_cpus_for_local_worker=1, ) - .rollouts( + .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. - num_envs_per_worker=(num_gpus or 1), + num_envs_per_env_runner=(num_gpus or 1), remote_worker_envs=True, ) .reporting( diff --git a/rllib/tuned_examples/dreamerv3/atari_200M.py b/rllib/tuned_examples/dreamerv3/atari_200M.py index b74fa71dd0cc..fdbb31d94aa2 100644 --- a/rllib/tuned_examples/dreamerv3/atari_200M.py +++ b/rllib/tuned_examples/dreamerv3/atari_200M.py @@ -26,10 +26,10 @@ # if you don't have enough CPUs. num_cpus_for_local_worker=8 * (num_gpus or 1), ) - .rollouts( + .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. - num_envs_per_worker=8 * (num_gpus or 1), + num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True, ) .environment( diff --git a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py index 133c4807c92f..4e262c9e0e5d 100644 --- a/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py +++ b/rllib/tuned_examples/dreamerv3/dm_control_suite_vision.py @@ -26,7 +26,7 @@ num_gpus_per_learner_worker=1 if num_gpus else 0, num_cpus_for_local_worker=1, ) - .rollouts(num_envs_per_worker=4 * (num_gpus or 1), remote_worker_envs=True) + .env_runners(num_envs_per_env_runner=4 * (num_gpus or 1), remote_worker_envs=True) .reporting( metrics_num_episodes_for_smoothing=(num_gpus or 1), report_images_and_videos=False, diff --git a/rllib/tuned_examples/dreamerv3/flappy_bird.py b/rllib/tuned_examples/dreamerv3/flappy_bird.py index a72b811f007c..1adbbeac44c5 100644 --- a/rllib/tuned_examples/dreamerv3/flappy_bird.py +++ b/rllib/tuned_examples/dreamerv3/flappy_bird.py @@ -49,10 +49,10 @@ def _env_creator(ctx): num_gpus_per_learner_worker=1 if num_gpus else 0, num_cpus_for_local_worker=1, ) - .rollouts( + .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. 
- num_envs_per_worker=8 * (num_gpus or 1), + num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True, ) .reporting( diff --git a/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py b/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py index 56e2cd36969d..2de15805451f 100644 --- a/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py +++ b/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py @@ -45,8 +45,8 @@ ) # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. - .rollouts( - num_envs_per_worker=8 * (num_gpus or 1), remote_worker_envs=True + .env_runners( + num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True ).reporting( metrics_num_episodes_for_smoothing=(num_gpus or 1), report_images_and_videos=False, diff --git a/rllib/tuned_examples/dreamerv3/highway_env.py b/rllib/tuned_examples/dreamerv3/highway_env.py index 77609f782f27..b96562fb22ea 100644 --- a/rllib/tuned_examples/dreamerv3/highway_env.py +++ b/rllib/tuned_examples/dreamerv3/highway_env.py @@ -42,10 +42,10 @@ num_gpus_per_learner_worker=1 if num_gpus else 0, num_cpus_for_local_worker=1, ) - .rollouts( + .env_runners( # If we use >1 GPU and increase the batch size accordingly, we should also # increase the number of envs per worker. - num_envs_per_worker=8 * (num_gpus or 1), + num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True, ) .reporting( diff --git a/rllib/tuned_examples/impala/atari-impala-large.yaml b/rllib/tuned_examples/impala/atari-impala-large.yaml index 8e8a882e84c7..a71c30b6d15a 100644 --- a/rllib/tuned_examples/impala/atari-impala-large.yaml +++ b/rllib/tuned_examples/impala/atari-impala-large.yaml @@ -19,7 +19,7 @@ atari-impala: rollout_fragment_length: 50 train_batch_size: 500 num_workers: 128 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 clip_rewards: True lr_schedule: [ [0, 0.0005], diff --git a/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml b/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml index 35568b1092b4..c74daf2d54df 100644 --- a/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml +++ b/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml @@ -16,7 +16,7 @@ atari-impala: num_gpus: 4 num_workers: 31 num_gpus_per_worker: 0 # works also for partial GPUs (<1.0) per worker - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 clip_rewards: True lr_schedule: [ [0, 0.0005], diff --git a/rllib/tuned_examples/impala/atari-impala.yaml b/rllib/tuned_examples/impala/atari-impala.yaml index 5c5a4d8fed9b..86b3bcb39852 100644 --- a/rllib/tuned_examples/impala/atari-impala.yaml +++ b/rllib/tuned_examples/impala/atari-impala.yaml @@ -17,7 +17,7 @@ atari-impala: rollout_fragment_length: 50 train_batch_size: 500 num_workers: 32 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 clip_rewards: True lr_schedule: [ [0, 0.0005], diff --git a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py index ef3754704234..c0a3c12ca4c1 100644 --- a/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py +++ b/rllib/tuned_examples/impala/cartpole-impala-separate-losses.py @@ -26,9 +26,9 @@ "vf_share_layers": False, }, ) - .rollouts( - num_envs_per_worker=5, - num_rollout_workers=1, + .env_runners( + num_envs_per_env_runner=5, + num_env_runners=1, observation_filter="MeanStdFilter", ) .resources(num_gpus=0) diff --git a/rllib/tuned_examples/impala/memory-leak-test-impala.yaml 
b/rllib/tuned_examples/impala/memory-leak-test-impala.yaml index 49b644e2c9db..56870e14f29d 100644 --- a/rllib/tuned_examples/impala/memory-leak-test-impala.yaml +++ b/rllib/tuned_examples/impala/memory-leak-test-impala.yaml @@ -14,4 +14,4 @@ memory-leak-test-impala: num_gpus: 0 num_workers: 3 num_aggregation_workers: 1 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 diff --git a/rllib/tuned_examples/impala/multi_agent_cartpole_impala.py b/rllib/tuned_examples/impala/multi_agent_cartpole_impala.py index d07fcd05acf1..78570bad5bb3 100644 --- a/rllib/tuned_examples/impala/multi_agent_cartpole_impala.py +++ b/rllib/tuned_examples/impala/multi_agent_cartpole_impala.py @@ -8,9 +8,9 @@ config = ( ImpalaConfig() .environment("env", env_config={"num_agents": 4}) - .rollouts( - num_envs_per_worker=5, - num_rollout_workers=4, + .env_runners( + num_envs_per_env_runner=5, + num_env_runners=4, observation_filter="MeanStdFilter", ) .resources(num_gpus=1, _fake_gpus=True) diff --git a/rllib/tuned_examples/impala/pong-impala-fast.yaml b/rllib/tuned_examples/impala/pong-impala-fast.yaml index d038f207af6c..36aad815c0d8 100644 --- a/rllib/tuned_examples/impala/pong-impala-fast.yaml +++ b/rllib/tuned_examples/impala/pong-impala-fast.yaml @@ -15,7 +15,7 @@ pong-impala-fast: rollout_fragment_length: 50 train_batch_size: 1000 num_workers: 128 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 broadcast_interval: 5 max_sample_requests_in_flight_per_worker: 1 num_multi_gpu_tower_stacks: 4 diff --git a/rllib/tuned_examples/impala/pong-impala-vectorized.yaml b/rllib/tuned_examples/impala/pong-impala-vectorized.yaml index 9623bd8d1e27..d00050c141b8 100644 --- a/rllib/tuned_examples/impala/pong-impala-vectorized.yaml +++ b/rllib/tuned_examples/impala/pong-impala-vectorized.yaml @@ -13,4 +13,4 @@ pong-impala-vectorized: rollout_fragment_length: 50 train_batch_size: 500 num_workers: 32 - num_envs_per_worker: 10 + num_envs_per_env_runner: 10 diff --git a/rllib/tuned_examples/impala/pong-impala.yaml b/rllib/tuned_examples/impala/pong-impala.yaml index b003be9b850e..af7587190e58 100644 --- a/rllib/tuned_examples/impala/pong-impala.yaml +++ b/rllib/tuned_examples/impala/pong-impala.yaml @@ -15,4 +15,4 @@ pong-impala: rollout_fragment_length: 50 train_batch_size: 500 num_workers: 128 - num_envs_per_worker: 1 + num_envs_per_env_runner: 1 diff --git a/rllib/tuned_examples/marwil/cartpole-marwil.yaml b/rllib/tuned_examples/marwil/cartpole-marwil.yaml index 09d3dd36526c..ed9fa30e7cad 100644 --- a/rllib/tuned_examples/marwil/cartpole-marwil.yaml +++ b/rllib/tuned_examples/marwil/cartpole-marwil.yaml @@ -12,7 +12,7 @@ cartpole-marwil: framework: torch # In order to evaluate on an actual environment, use these following # settings: - evaluation_num_workers: 1 + evaluation_num_env_runners: 1 evaluation_interval: 1 evaluation_config: input: sampler diff --git a/rllib/tuned_examples/ppo/atari-ppo.yaml b/rllib/tuned_examples/ppo/atari-ppo.yaml index 22a024da04d2..50df7bdcc50f 100644 --- a/rllib/tuned_examples/ppo/atari-ppo.yaml +++ b/rllib/tuned_examples/ppo/atari-ppo.yaml @@ -27,7 +27,7 @@ atari-ppo: sgd_minibatch_size: 500 num_sgd_iter: 10 num_workers: 10 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 batch_mode: truncate_episodes observation_filter: NoFilter model: diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index bcb0219ae382..f05a63c55319 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ 
b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -81,7 +81,7 @@ def stop_all(self): .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, # Following the paper. num_rollout_workers=32, @@ -116,7 +116,7 @@ def stop_all(self): .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ "explore": True, diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index dcee12989429..a7a21431872b 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -71,7 +71,7 @@ .environment(env=env) # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=num_rollout_workers, @@ -117,7 +117,7 @@ .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ # PPO learns stochastic policy. diff --git a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py index 4971289c9850..c627fabc5a84 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py @@ -6,9 +6,9 @@ PPOConfig() # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=1, + num_env_runners=1, ) .environment("CartPole-v1") .rl_module( @@ -25,7 +25,7 @@ vf_loss_coeff=0.01, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_parallel_to_training=True, ) diff --git a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py index fc41a7fb86bd..a91c518eb047 100644 --- a/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py +++ b/rllib/tuned_examples/ppo/cartpole_truncated_ppo.py @@ -14,12 +14,12 @@ config = ( PPOConfig() .environment("cartpole_truncated") - .rollouts(num_envs_per_worker=10) + .env_runners(num_envs_per_env_runner=10) # For evaluation, use the "real" CartPole-v1 env (up to 500 steps). 
.evaluation( evaluation_config=PPOConfig.overrides(env="CartPole-v1"), evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, ) ) diff --git a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml index 8e442f6a0492..92a43a44296e 100644 --- a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml +++ b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml @@ -19,7 +19,7 @@ halfcheetah-ppo: num_workers: 16 num_gpus: 1 grad_clip: 0.5 - num_envs_per_worker: + num_envs_per_env_runner: grid_search: [16, 32] batch_mode: truncate_episodes observation_filter: MeanStdFilter diff --git a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml index 18b0422a4d1a..2b4a316c0b6a 100644 --- a/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml +++ b/rllib/tuned_examples/ppo/memory-leak-test-ppo.yaml @@ -10,7 +10,7 @@ memory-leak-test-ppo: config: static_samples: true num_workers: 4 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 train_batch_size: 500 sgd_minibatch_size: 256 num_sgd_iter: 5 diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index 6c33fb71194d..bd9db73545e7 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -8,10 +8,10 @@ .experimental(_enable_new_api_stack=True) # Switch off np.random, which is known to have memory leaks. .environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=4, - num_envs_per_worker=5, + num_env_runners=4, + num_envs_per_env_runner=5, ) .training(train_batch_size=500, sgd_minibatch_size=256, num_sgd_iter=5) ) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py index 8475cab08b5b..a83f5aea1581 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py @@ -10,10 +10,10 @@ PPOConfig() .experimental(_enable_new_api_stack=True) .environment("multi_agent_pendulum") - .rollouts( + .env_runners( env_runner_cls=MultiAgentEnvRunner, - num_envs_per_worker=1, - num_rollout_workers=4, + num_envs_per_env_runner=1, + num_env_runners=4, ) .training( train_batch_size=512, diff --git a/rllib/tuned_examples/ppo/pendulum-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-ppo.yaml index ad0b22fd077c..4dca4402975a 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo.yaml @@ -11,7 +11,7 @@ pendulum-ppo: train_batch_size: 512 vf_clip_param: 10.0 num_workers: 0 - num_envs_per_worker: 20 + num_envs_per_env_runner: 20 lambda: 0.1 gamma: 0.95 lr: 0.0003 diff --git a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml index 5763dfad22f3..ce0472bcb33d 100644 --- a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml @@ -18,7 +18,7 @@ pendulum-ppo: normalize_actions: true clip_actions: false vf_clip_param: 10.0 - num_envs_per_worker: 20 + num_envs_per_env_runner: 20 lambda: 0.1 gamma: 0.95 lr: 0.0003 diff --git a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py index 5ad049d7aadd..45f52388d325 100644 --- 
a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py @@ -6,10 +6,10 @@ PPOConfig() # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=2, - num_envs_per_worker=20, + num_env_runners=2, + num_envs_per_env_runner=20, ) .environment("Pendulum-v1") .training( @@ -25,7 +25,7 @@ }, ) .evaluation( - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_interval=1, evaluation_parallel_to_training=True, ) diff --git a/rllib/tuned_examples/ppo/pong-ppo.yaml b/rllib/tuned_examples/ppo/pong-ppo.yaml index 275eaabdbe4c..43f61424bbd2 100644 --- a/rllib/tuned_examples/ppo/pong-ppo.yaml +++ b/rllib/tuned_examples/ppo/pong-ppo.yaml @@ -24,7 +24,7 @@ pong-ppo: sgd_minibatch_size: 500 num_sgd_iter: 10 num_workers: 32 - num_envs_per_worker: 5 + num_envs_per_env_runner: 5 batch_mode: truncate_episodes observation_filter: NoFilter num_gpus: 1 diff --git a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml b/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml index 47bae28fb49a..72955b9ee5aa 100644 --- a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml +++ b/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml @@ -43,7 +43,7 @@ recomm-sys001-ppo: # Evaluation settings. evaluation_interval: 1 - evaluation_num_workers: 4 + evaluation_num_env_runners: 4 evaluation_duration: 200 evaluation_duration_unit: episodes evaluation_parallel_to_training: true diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml index 56bb35672786..cc985404fae7 100644 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml @@ -15,7 +15,7 @@ repeat-after-me-ppo-w-lstm: gamma: 0.9 lr: 0.0003 num_workers: 0 - num_envs_per_worker: 20 + num_envs_per_env_runner: 20 num_sgd_iter: 5 entropy_coeff: 0.00001 model: diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index 2ce6c877daf6..7f33a20b3913 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -69,7 +69,7 @@ def stop_all(self): .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, @@ -112,7 +112,7 @@ def stop_all(self): .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ "explore": False, diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 43dfa1a2b8d7..414b94833a5e 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -59,7 +59,7 @@ .environment(env=env) # Enable new API stack and use EnvRunner. 
.experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( rollout_fragment_length="auto", env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=1, @@ -103,7 +103,7 @@ .evaluation( evaluation_duration="auto", evaluation_interval=1, - evaluation_num_workers=1, + evaluation_num_env_runners=1, evaluation_parallel_to_training=True, evaluation_config={ "explore": False, diff --git a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py index 062ea8091cbf..b4da28c927df 100644 --- a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py +++ b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py @@ -5,10 +5,10 @@ SACConfig() # Enable new API stack and use EnvRunner. .experimental(_enable_new_api_stack=True) - .rollouts( + .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + num_env_runners=0, ) .environment(env="Pendulum-v1") .rl_module( diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py index 94f638a88b76..8e6b671eb80c 100644 --- a/rllib/utils/exploration/tests/test_curiosity.py +++ b/rllib/utils/exploration/tests/test_curiosity.py @@ -166,10 +166,8 @@ def test_curiosity_on_frozen_lake(self): .callbacks(MyCallBack) # Limit horizon to make it really hard for non-curious agent to reach # the goal state. - .rollouts(num_rollout_workers=0) - # TODO (Kourosh): We need to provide examples on how we do curiosity with - # RLModule API - .training(lr=0.001).exploration( + .env_runners( + num_env_runners=0, exploration_config={ "type": "Curiosity", "eta": 0.2, @@ -182,8 +180,11 @@ def test_curiosity_on_frozen_lake(self): "sub_exploration": { "type": "StochasticSampling", }, - } + }, ) + # TODO (Kourosh): We need to provide examples on how we do curiosity with + # RLModule API + .training(lr=0.001) ) num_iterations = 10 @@ -231,15 +232,9 @@ def test_curiosity_on_partially_observable_domain(self): "framestack": 1, # seems to work even w/o framestacking }, ) - .rollouts(num_envs_per_worker=4, num_rollout_workers=0) - .training( - model={ - "fcnet_hiddens": [256, 256], - "fcnet_activation": "relu", - }, - num_sgd_iter=8, - ) - .exploration( + .env_runners( + num_envs_per_env_runner=4, + num_env_runners=0, exploration_config={ "type": "Curiosity", # For the feature NN, use a non-LSTM fcnet (same as the one @@ -256,7 +251,14 @@ def test_curiosity_on_partially_observable_domain(self): "sub_exploration": { "type": "StochasticSampling", }, - } + }, + ) + .training( + model={ + "fcnet_hiddens": [256, 256], + "fcnet_activation": "relu", + }, + num_sgd_iter=8, ) ) diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index 0d14c044295a..8d15b74c751a 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -25,7 +25,7 @@ def do_test_explorations(config, dummy_obs, prev_a=None, expected_mean_action=No if local_config._enable_new_api_stack: # TODO(Artur): Support Random exploration with RL Modules. 
continue - local_config.exploration(exploration_config={"type": "Random"}) + local_config.env_runners(exploration_config={"type": "Random"}) print("exploration={}".format(exploration or "default")) algo = local_config.build() @@ -81,7 +81,7 @@ def tearDownClass(cls): def test_dqn(self): config = ( - dqn.DQNConfig().environment("CartPole-v1").rollouts(num_rollout_workers=0) + dqn.DQNConfig().environment("CartPole-v1").env_runners(num_env_runners=0) ) do_test_explorations( config, @@ -92,7 +92,7 @@ def test_impala(self): config = ( impala.ImpalaConfig() .environment("CartPole-v1") - .rollouts(num_rollout_workers=0) + .env_runners(num_env_runners=0) .resources(num_gpus=0) ) do_test_explorations( @@ -103,7 +103,7 @@ def test_impala(self): def test_ppo_discr(self): config = ( - ppo.PPOConfig().environment("CartPole-v1").rollouts(num_rollout_workers=0) + ppo.PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0) ) do_test_explorations( config, @@ -113,7 +113,7 @@ def test_ppo_discr(self): def test_ppo_cont(self): config = ( - ppo.PPOConfig().environment("Pendulum-v1").rollouts(num_rollout_workers=0) + ppo.PPOConfig().environment("Pendulum-v1").env_runners(num_env_runners=0) ) do_test_explorations( config, @@ -124,7 +124,7 @@ def test_ppo_cont(self): def test_sac(self): config = ( - sac.SACConfig().environment("Pendulum-v1").rollouts(num_rollout_workers=0) + sac.SACConfig().environment("Pendulum-v1").env_runners(num_env_runners=0) ) do_test_explorations( config, diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 1460decfc224..661c44edebc9 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1230,7 +1230,7 @@ def run_rllib_example_script_experiment( base_config: The AlgorithmConfig object to use for this experiment. This base config will be automatically "extended" based on some of the provided `args`. For example, `args.num_env_runners` is used to set - `config.num_rollout_workers`, etc.. + `config.num_env_runners`, etc.. args: A argparse.Namespace object, ideally returned by calling `args = add_rllib_example_script_args()`. It must have the following properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`, @@ -1282,8 +1282,8 @@ def run_rllib_example_script_experiment( # Enable the new API stack? .experimental(_enable_new_api_stack=args.enable_new_api_stack) # Define EnvRunner/RolloutWorker scaling and behavior. - .rollouts( - num_rollout_workers=args.num_env_runners, + .env_runners( + num_env_runners=args.num_env_runners, # Set up the correct env-runner to use depending on # old-stack/new-stack and multi-agent settings. env_runner_cls=( @@ -1527,7 +1527,7 @@ def check_reproducibilty( # new API num_gpus_per_learner_worker=int(os.environ.get("RLLIB_NUM_GPUS", "0")), ) - .rollouts(num_rollout_workers=num_workers, num_envs_per_worker=2) + .env_runners(num_rollout_workers=num_workers, num_envs_per_worker=2) ) for fw in framework_iterator(algo_config, **fw_kwargs): diff --git a/rllib/utils/tests/test_errors.py b/rllib/utils/tests/test_errors.py index e55d54f54272..f3f7f32a42e1 100644 --- a/rllib/utils/tests/test_errors.py +++ b/rllib/utils/tests/test_errors.py @@ -37,7 +37,7 @@ def test_no_gpus_error(self): def test_bad_envs(self): """Tests different "bad env" errors.""" config = ( - ppo.PPOConfig().rollouts(num_rollout_workers=0) + ppo.PPOConfig().env_runners(num_env_runners=0) # Non existing/non-registered gym env string. 
.environment("Alien-Attack-v42") ) From 090f301d52aae6a98ecd1fa1b3334cef63664907 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 23 Apr 2024 10:48:38 +0200 Subject: [PATCH 03/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 44 ++--- rllib/algorithms/algorithm_config.py | 158 +++++++++++------- rllib/algorithms/appo/appo.py | 12 +- rllib/algorithms/appo/appo_tf_policy.py | 4 +- rllib/algorithms/appo/appo_torch_policy.py | 2 +- .../appo/tests/test_appo_learner.py | 4 +- rllib/algorithms/appo/tf/appo_tf_learner.py | 2 +- .../appo/torch/appo_torch_learner.py | 2 +- rllib/algorithms/bc/bc.py | 4 +- rllib/algorithms/bc/tests/test_bc.py | 2 +- rllib/algorithms/dqn/dqn.py | 14 +- rllib/algorithms/dreamerv3/dreamerv3.py | 8 +- rllib/algorithms/impala/impala.py | 38 ++--- rllib/algorithms/impala/impala_tf_policy.py | 4 +- .../algorithms/impala/impala_torch_policy.py | 2 +- .../impala/tests/test_impala_learner.py | 2 +- .../tests/test_impala_off_policyness.py | 2 +- .../algorithms/impala/tf/impala_tf_learner.py | 2 +- .../impala/torch/impala_torch_learner.py | 2 +- rllib/algorithms/ppo/ppo.py | 22 +-- rllib/algorithms/ppo/ppo_learner.py | 2 +- rllib/algorithms/ppo/tests/test_ppo.py | 11 -- .../algorithms/ppo/tests/test_ppo_learner.py | 6 +- .../ppo/tests/test_ppo_with_env_runner.py | 2 +- .../ppo/tests/test_ppo_with_rl_module.py | 6 +- rllib/algorithms/sac/sac.py | 6 +- .../algorithms/tests/test_algorithm_config.py | 21 +-- .../tests/test_algorithm_export_checkpoint.py | 2 +- .../tests/test_callbacks_on_env_runner.py | 6 +- .../algorithms/tests/test_worker_failures.py | 26 +-- rllib/connectors/agent/state_buffer.py | 2 +- rllib/connectors/agent/view_requirement.py | 2 +- rllib/core/learner/learner.py | 2 +- rllib/core/models/tests/test_catalog.py | 2 +- rllib/core/testing/tests/test_bc_algorithm.py | 6 +- .../env/tests/test_multi_agent_env_runner.py | 2 +- rllib/evaluation/env_runner_v2.py | 4 +- rllib/evaluation/episode_v2.py | 2 +- rllib/evaluation/postprocessing.py | 2 +- rllib/evaluation/rollout_worker.py | 12 +- .../evaluation/tests/test_envs_that_crash.py | 4 +- .../tests/test_trajectory_view_api.py | 10 +- rllib/evaluation/worker_set.py | 2 +- .../policy/episode_env_aware_policy.py | 2 +- rllib/examples/action_masking.py | 6 +- ...raining_step_on_and_off_policy_combined.py | 3 +- rllib/examples/autoregressive_action_dist.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- rllib/examples/checkpoints/onnx_tf.py | 2 +- rllib/examples/checkpoints/onnx_torch.py | 2 +- .../debugging/deterministic_training.py | 2 +- .../envs/external_envs/cartpole_server.py | 2 +- ...inference_after_training_with_attention.py | 2 +- ...licy_inference_after_training_with_lstm.py | 2 +- .../examples/learners/ppo_load_rl_modules.py | 2 +- .../learners/train_w_bc_finetune_w_ppo.py | 2 +- .../self_play_league_based_with_open_spiel.py | 4 +- .../multi_agent/self_play_with_open_spiel.py | 2 +- rllib/examples/multi_agent/two_algorithms.py | 2 +- .../ray_serve/ray_serve_with_rllib.py | 2 +- rllib/examples/ray_tune/custom_experiment.py | 4 +- rllib/examples/ray_tune/custom_logger.py | 2 +- .../ray_tune/custom_progress_reporter.py | 2 +- .../rl_modules/classes/mobilenet_rlm.py | 2 +- rllib/models/tests/test_preprocessors.py | 2 +- rllib/policy/eager_tf_policy.py | 6 +- rllib/policy/eager_tf_policy_v2.py | 22 +-- rllib/policy/policy.py | 18 +- .../tests/test_compute_log_likelihoods.py | 4 +- .../tests/test_export_checkpoint_and_model.py | 2 +- rllib/policy/tests/test_policy.py | 4 +- 
rllib/policy/tf_mixins.py | 14 +- rllib/policy/torch_mixins.py | 6 +- rllib/policy/torch_policy.py | 2 +- rllib/policy/torch_policy_v2.py | 44 ++--- .../tests/test_algorithm_rl_module_restore.py | 12 +- ..._algorithm_save_load_checkpoint_learner.py | 4 +- rllib/tests/test_rllib_train_and_evaluate.py | 2 +- ...artpole-appo-w-rl-modules-and-learner.yaml | 2 +- .../pong-appo-w-rl-modules-and-learner.yaml | 2 +- .../appo/stateless_cartpole_appo.py | 2 +- .../tuned_examples/dqn/benchmark_dqn_atari.py | 2 +- ...benchmark_dqn_atari_rllib_preprocessing.py | 2 +- .../dqn/cartpole_dqn_envrunner.py | 2 +- .../impala/cartpole-impala.yaml | 2 +- .../ppo/benchmark_ppo_mujoco.py | 2 +- .../ppo/benchmark_ppo_mujoco_pb2.py | 2 +- .../ppo/cartpole_ppo_envrunner.py | 2 +- .../ppo/memory_leak_test_ppo_new_stack.py | 2 +- .../ppo/multi_agent_pendulum_ppo_envrunner.py | 2 +- .../ppo/pendulum_ppo_envrunner.py | 2 +- .../sac/benchmark_sac_mujoco.py | 2 +- .../sac/benchmark_sac_mujoco_pb2.py | 2 +- .../sac/pendulum_sac_envrunner.py | 2 +- rllib/utils/checkpoints.py | 2 +- rllib/utils/debug/memory.py | 4 +- .../exploration/tests/test_explorations.py | 2 +- rllib/utils/test_utils.py | 14 +- 98 files changed, 366 insertions(+), 351 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index a1b41e7cf63f..d020ac9f84f5 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -710,7 +710,7 @@ def setup(self, config: AlgorithmConfig) -> None: method_config["type"] = method_type self.learner_group = None - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: local_worker = self.workers.local_worker() env = spaces = None # EnvRunners have a `module` property, which stores the RLModule @@ -760,7 +760,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Note that with the new EnvRunner API in combination with the new stack, # this information only needs to be kept in the Learner and not on the # EnvRunners anymore. - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: policies_to_train = self.config.policies_to_train or set( self.config.policies ) @@ -875,7 +875,7 @@ def step(self) -> ResultDict: # references). Then distribute the episode refs to the learners, store metrics # in special key in result dict and perform the connector merge/broadcast # inside the `training_step` as well. See the new IMPALA for an example. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Synchronize EnvToModule and ModuleToEnv connector states and broadcast new # states back to all EnvRunners. with self._timers[SYNCH_ENV_CONNECTOR_STATES_TIMER]: @@ -951,7 +951,7 @@ def evaluate( from_worker_or_learner_group=self.workers.local_worker() ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Synchronize EnvToModule and ModuleToEnv connector states and broadcast # new states back to all eval EnvRunners. 
with self._timers[SYNCH_EVAL_ENV_CONNECTOR_STATES_TIMER]: @@ -1104,7 +1104,7 @@ def _evaluate_on_local_env_runner(self, env_runner): logger.info(f"Evaluating current state of {self} for {duration} {unit}.") all_batches = [] - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: episodes = env_runner.sample( num_timesteps=duration if unit == "timesteps" else None, num_episodes=duration if unit == "episodes" else None, @@ -1179,7 +1179,7 @@ def _env_runner_remote(worker, num, round, iter): ): _round += 1 # New API stack -> EnvRunners return Episodes. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Compute rough number of timesteps it takes for a single EnvRunner # to occupy the estimated (parallelly running) train step. _num = min( @@ -1324,7 +1324,7 @@ def _env_runner_remote(worker, num, round, iter): _round += 1 - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: _num = [None] + [ (units_left_to_do // num_healthy_workers) + bool(i <= (units_left_to_do % num_healthy_workers)) @@ -1529,7 +1529,7 @@ def training_step(self) -> ResultDict: Returns: The results dict from executing the training iteration. """ - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: raise NotImplementedError( "The `Algorithm.training_step()` default implementation no longer " "supports the old or hybrid API stacks! If you would like to continue " @@ -1565,7 +1565,7 @@ def training_step(self) -> ResultDict: # cases should use the multi-GPU optimizer, even if only using 1 GPU). # TODO: (sven) rename MultiGPUOptimizer into something more # meaningful. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: train_results = self.learner_group.update_from_batch(batch=train_batch) elif self.config.get("simple_optimizer") is True: train_results = train_one_step(self, train_batch) @@ -1583,7 +1583,7 @@ def training_step(self) -> ResultDict: with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: # TODO (Avnish): Implement this on learner_group.get_weights(). from_worker_or_trainer = None - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: from_worker_or_trainer = self.learner_group self.workers.sync_weights( @@ -2024,7 +2024,7 @@ def add_policy( The newly added policy (the copy that got added to the local worker). If `workers` was provided, None is returned. """ - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: raise ValueError( "`Algorithm.add_policy()` is not supported on the new API stack w/ " "EnvRunners! Use `Algorithm.add_module()` instead. Also see " @@ -2049,7 +2049,7 @@ def add_policy( # If learner API is enabled, we need to also add the underlying module # to the learner group. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: policy = self.get_policy(policy_id) module = policy.model self.learner_group.add_module( @@ -2203,7 +2203,7 @@ def fn(worker): # Update each Learner's `policies_to_train` information, but only # if the arg is explicitly provided here. 
- if self.config._enable_new_api_stack and policies_to_train is not None: + if self.config.enable_rl_module_and_learner and policies_to_train is not None: self.learner_group.foreach_learner( lambda learner: learner.config.multi_agent( policies_to_train=policies_to_train @@ -2316,7 +2316,7 @@ def save_checkpoint(self, checkpoint_dir: str) -> None: policy_states = state["worker"].pop("policy_states", {}) # Add RLlib checkpoint version. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: state["checkpoint_version"] = CHECKPOINT_VERSION_LEARNER else: state["checkpoint_version"] = CHECKPOINT_VERSION @@ -2351,7 +2351,7 @@ def save_checkpoint(self, checkpoint_dir: str) -> None: policy.export_checkpoint(policy_dir, policy_state=policy_state) # if we are using the learner API, save the learner group state - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.save_state(learner_state_dir) @@ -2363,7 +2363,7 @@ def load_checkpoint(self, checkpoint_dir: str) -> None: checkpoint_info = get_checkpoint_info(checkpoint_dir) checkpoint_data = Algorithm._checkpoint_info_to_algorithm_state(checkpoint_info) self.__setstate__(checkpoint_data) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: learner_state_dir = os.path.join(checkpoint_dir, "learner") self.learner_group.load_state(learner_state_dir) # Make also sure, all training EnvRunners get the just loaded weights. @@ -2415,7 +2415,7 @@ def default_resource_request( eval_cf.freeze() # resources for the driver of this trainable - if cf._enable_new_api_stack: + if cf.enable_rl_module_and_learner: if cf.num_learner_workers == 0: # in this case local_worker only does sampling and training is done on # local learner worker @@ -2470,7 +2470,7 @@ def default_resource_request( # resources for remote learner workers learner_bundles = [] - if cf._enable_new_api_stack and cf.num_learner_workers > 0: + if cf.enable_rl_module_and_learner and cf.num_learner_workers > 0: learner_bundles = cls._get_learner_bundles(cf) bundles = [driver] + rollout_bundles + evaluation_bundles + learner_bundles @@ -2775,7 +2775,7 @@ def __getstate__(self) -> Dict: if ( hasattr(self, "evaluation_workers") and self.evaluation_workers is not None - and not self.config.uses_new_env_runners + and not self.config.enable_env_runner_and_connector_v2 ): state[ "eval_policy_mapping_fn" @@ -2850,12 +2850,12 @@ def _setup_eval_worker(w): "data found in state!" ) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if "learner_state_dir" in state: self.learner_group.load_state(state["learner_state_dir"]) else: logger.warning( - "You configured `_enable_new_api_stack=True`, but no " + "You configured `enable_rl_module_and_learner=True`, but no " "`learner_state_dir` key could be found in the state dict!" 
                )
@@ -3011,7 +3011,7 @@ def _checkpoint_info_to_algorithm_state(
         ):
             worker_state["is_policy_to_train"] = policies_to_train

-        if state["config"]._enable_new_api_stack:
+        if state["config"].enable_rl_module_and_learner:
             state["learner_state_dir"] = os.path.join(
                 checkpoint_info["checkpoint_dir"], "learner"
             )
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py
index d23eb0f0f056..28593a290764 100644
--- a/rllib/algorithms/algorithm_config.py
+++ b/rllib/algorithms/algorithm_config.py
@@ -319,6 +319,10 @@ def __init__(self, algo_class: Optional[type] = None):
         )
         self.torch_compile_worker_dynamo_mode = None

+        # `self.api_stack()`
+        self.enable_rl_module_and_learner = False
+        self.enable_env_runner_and_connector_v2 = False
+
         # `self.environment()`
         self.env = None
         self.env_config = {}
@@ -512,7 +516,6 @@ def __init__(self, algo_class: Optional[type] = None):
         self.__prior_exploration_config = None

         # `self.experimental()`
-        self._enable_new_api_stack = False
         self._tf_policy_handles_more_than_one_loss = False
         self._disable_preprocessor_api = False
         self._disable_action_flattening = False
@@ -524,6 +527,7 @@ def __init__(self, algo_class: Optional[type] = None):
         # TODO: Remove, once all deprecation_warning calls upon using these keys
         # have been removed.
         # === Deprecated keys ===
+        self._enable_new_api_stack = DEPRECATED_VALUE
         self.evaluation_num_workers = DEPRECATED_VALUE
         self.simple_optimizer = DEPRECATED_VALUE
         self.monitor = DEPRECATED_VALUE
@@ -660,10 +664,12 @@ def update_from_dict(
         # Namely, we want to re-instantiate the exploration config this config had
         # inside `self.experimental()` before potentially overwriting it in the
         # following.
-        if "_enable_new_api_stack" in config_dict:
-            self.experimental(
-                _enable_new_api_stack=config_dict["_enable_new_api_stack"]
-            )
+        enable_rl_module_and_learner = config_dict.get(
+            "_enable_new_api_stack",
+            config_dict.get("enable_rl_module_and_learner"),
+        )
+        if enable_rl_module_and_learner:
+            self.api_stack(enable_rl_module_and_learner=enable_rl_module_and_learner)

         # Modify our properties one by one.
         for key, value in config_dict.items():
@@ -674,7 +680,7 @@ def update_from_dict(
             if key == TRIAL_INFO:
                 continue

-            if key == "_enable_new_api_stack":
+            if key in ["_enable_new_api_stack", "enable_rl_module_and_learner"]:
                 # We've dealt with this above.
                 continue
             # Set our multi-agent settings.
@@ -707,7 +713,7 @@ def update_from_dict(
             elif key.startswith("evaluation_"):
                 eval_call[key] = value
             elif key == "exploration_config":
-                if config_dict.get("_enable_new_api_stack", False):
+                if enable_rl_module_and_learner:
                     self.exploration_config = value
                     continue
                 if isinstance(value, dict) and "type" in value:
@@ -1398,6 +1404,50 @@ def framework(

         return self
+
+    def api_stack(
+        self,
+        enable_rl_module_and_learner: Optional[bool] = NotProvided,
+        enable_env_runner_and_connector_v2: Optional[bool] = NotProvided,
+    ) -> "AlgorithmConfig":
+        """Sets the config's API stack settings.
+
+        Args:
+            enable_rl_module_and_learner: Enables the usage of `RLModule` (instead of
+                `ModelV2`) and Learner (instead of the training-related parts of
+                `Policy`). If `enable_env_runner_and_connector_v2=False`, these two
+                classes (`RLModule` and `Learner`) will be used along with
+                `RolloutWorkers` and `Policy`.
+            enable_env_runner_and_connector_v2: Enables the usage of EnvRunners
+                (SingleAgentEnvRunner and MultiAgentEnvRunner) and ConnectorV2.
+                When setting this to True, `enable_rl_module_and_learner` must be True
+                as well. 
+ + Returns: + This updated AlgorithmConfig object. + """ + if enable_rl_module_and_learner is not NotProvided: + self.enable_rl_module_and_learner = enable_rl_module_and_learner + + if enable_rl_module_and_learner is True and self.exploration_config: + self.__prior_exploration_config = self.exploration_config + self.exploration_config = {} + + elif enable_rl_module_and_learner is False and not self.exploration_config: + if self.__prior_exploration_config is not None: + self.exploration_config = self.__prior_exploration_config + self.__prior_exploration_config = None + else: + logger.warning( + "config.enable_rl_module_and_learner was set to False, but no " + "prior exploration config was found to be restored." + ) + + if enable_env_runner_and_connector_v2 is not NotProvided: + self.enable_env_runner_and_connector_v2 = enable_env_runner_and_connector_v2 + + return self + def environment( self, env: Optional[Union[str, EnvType]] = NotProvided, @@ -1925,7 +1975,7 @@ def training( full list of the available model options. TODO: Provide ModelConfig objects instead of dicts. optimizer: Arguments to pass to the policy optimizer. This setting is not - used when `_enable_new_api_stack=True`. + used when `enable_rl_module_and_learner=True`. max_requests_in_flight_per_sampler_worker: Max number of inflight requests to each sampling worker. See the FaultTolerantActorManager class for more details. @@ -1939,7 +1989,7 @@ def training( turn down the number of remote requests in flight, or enable compression in your experiment of timesteps. learner_class: The `Learner` class to use for (distributed) updating of the - RLModule. Only used when `_enable_new_api_stack=True`. + RLModule. Only used when `enable_rl_module_and_learner=True`. learner_connector: A callable taking an env observation space and an env action space as inputs and returning a learner ConnectorV2 (might be a pipeline) object. @@ -1988,8 +2038,8 @@ def training( deprecation_warning( old="AlgorithmConfig.training(_use_default_native_models=True)", help="_use_default_native_models is not supported " - "anymore. To get rid of this error, set `config.experimental(" - "_enable_new_api_stack=True)`. Native models will " + "anymore. To get rid of this error, set `config.api_stack(" + "enable_rl_module_and_learner=True)`. Native models will " "be better supported by the upcoming RLModule API.", # Error out if user tries to enable this. error=model["_use_default_native_models"], @@ -2104,8 +2154,9 @@ def evaluation( (default) will make sure that the evaluation results will not be polluted with episode statistics that were actually (at least partially) achieved with an earlier set of weights. Note that this setting is only - supported on the new API stack (`config._enable_new_api_stack=True` - and `config.env_runner_cls=[SingleAgentEnvRunner|MultiAgentEnvrunner]`). + supported on the new API stack w/ EnvRunners and ConnectorV2 + (`config.enable_rl_module_and_learner=True` AND + `config.enable_env_runner_and_connector_v2=True`). evaluation_config: Typical usage is to pass extra args to evaluation env creator and to disable exploration by computing deterministic actions. IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal @@ -2367,7 +2418,7 @@ def multi_agent( These tuples or PolicySpecs define the class of the policy, the observation- and action spaces of the policies, and any extra config. algorithm_config_overrides_per_module: Only used if - `_enable_new_api_stack=True`. + `enable_rl_module_and_learner=True`. 
A mapping from ModuleIDs to per-module AlgorithmConfig override dicts, which apply certain settings, e.g. the learning rate, from the main AlgorithmConfig only to this @@ -2785,8 +2836,8 @@ def rl_module( if _enable_rl_module_api is not NotProvided: deprecation_warning( - old="AlgorithmConfig.rl_module(_enable_rl_module_api=True|False)", - new="AlgorithmConfig.experimental(_enable_new_api_stack=True|False)", + old="AlgorithmConfig.rl_module(_enable_rl_module_api=..)", + new="AlgorithmConfig.api_stack(enable_rl_module_and_learner=..)", error=False, ) return self @@ -2794,20 +2845,16 @@ def rl_module( def experimental( self, *, - _enable_new_api_stack: Optional[bool] = NotProvided, _tf_policy_handles_more_than_one_loss: Optional[bool] = NotProvided, _disable_preprocessor_api: Optional[bool] = NotProvided, _disable_action_flattening: Optional[bool] = NotProvided, _disable_initialize_loss_from_dummy_batch: Optional[bool] = NotProvided, # Deprecated args. - _disable_execution_plan_api=None, + _enable_new_api_stack=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the config's experimental settings. Args: - _enable_new_api_stack: Enables the new API stack, which will use RLModule - (instead of ModelV2) as well as the multi-GPU capable Learner API - (instead of using Policy to compute loss and update the model). _tf_policy_handles_more_than_one_loss: Experimental flag. If True, TFPolicy will handle more than one loss/optimizer. Set this to True, if you would like to return more than @@ -2829,31 +2876,13 @@ def experimental( Returns: This updated AlgorithmConfig object. """ - if _disable_execution_plan_api is not None: + if _enable_new_api_stack != DEPRECATED_VALUE: deprecation_warning( - old="config.experimental(_disable_execution_plan_api=...)", - help="The execution plan API is no longer supported! Use subclassing " - "of the `Algorithm` class and override the " - "`Algorithm.training_step()` method instead.", - error=True, + old="config.experimental(_enable_new_api_stack=...)", + new="config.api_stack(enable_rl_module_and_learner=...)", + error=False, ) - - if _enable_new_api_stack is not NotProvided: - self._enable_new_api_stack = _enable_new_api_stack - - if _enable_new_api_stack is True and self.exploration_config: - self.__prior_exploration_config = self.exploration_config - self.exploration_config = {} - - elif _enable_new_api_stack is False and not self.exploration_config: - if self.__prior_exploration_config is not None: - self.exploration_config = self.__prior_exploration_config - self.__prior_exploration_config = None - else: - logger.warning( - "config._enable_new_api_stack was set to False, but no prior " - "exploration config was found to be restored." - ) + self.api_stack(enable_rl_module_and_learner=_enable_new_api_stack) if _tf_policy_handles_more_than_one_loss is not NotProvided: self._tf_policy_handles_more_than_one_loss = ( @@ -2936,12 +2965,6 @@ def is_atari(self) -> bool: return self._is_atari - @property - def uses_new_env_runners(self): - return self.env_runner_cls is not None and not issubclass( - self.env_runner_cls, RolloutWorker - ) - @property def total_train_batch_size(self): if self.train_batch_size_per_learner is not None: @@ -3777,11 +3800,11 @@ def _validate_framework_settings(self) -> None: _torch, _ = try_import_torch() # Can not use "tf" with learner API. - if self.framework_str == "tf" and self._enable_new_api_stack: + if self.framework_str == "tf" and self.enable_rl_module_and_learner: raise ValueError( "Cannot use `framework=tf` with the new API stack! 
Either switch to tf2" " via `config.framework('tf2')` OR disable the new API stack via " - "`config.experimental(_enable_new_api_stack=False)`." + "`config.api_stack(enable_rl_module_and_learner=False)`." ) # Check if torch framework supports torch.compile. @@ -3857,7 +3880,7 @@ def _validate_multi_agent_settings(self): # multi-agent. if ( self.is_multi_agent() - and self.uses_new_env_runners + and self.enable_env_runner_and_connector_v2 and self.num_envs_per_worker > 1 ): raise ValueError( @@ -3956,14 +3979,14 @@ def _validate_input_settings(self): def _validate_new_api_stack_settings(self): """Checks, whether settings related to the new API stack make sense.""" - if not self._enable_new_api_stack: + if not self.enable_rl_module_and_learner: # Throw a warning if the user has used `self.rl_module(rl_module_spec=...)` # but has not enabled the new API stack at the same time. if self._rl_module_spec is not None: logger.warning( "You have setup a RLModuleSpec (via calling " "`config.rl_module(...)`), but have not enabled the new API stack. " - "To enable it, call `config.experimental(_enable_new_api_stack=" + "To enable it, call `config.api_stack(enable_rl_module_and_learner=" "True)`." ) # Throw a warning if the user has used `self.training(learner_class=...)` @@ -3973,18 +3996,18 @@ def _validate_new_api_stack_settings(self): "You specified a custom Learner class (via " f"`AlgorithmConfig.training(learner_class={self._learner_class})`, " f"but have the new API stack disabled. You need to enable it via " - "`AlgorithmConfig.experimental(_enable_new_api_stack=True)`." + "`AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)`." ) # User is using the new EnvRunners, but forgot to switch on - # `_enable_new_api_stack`. - if self.uses_new_env_runners: + # `enable_rl_module_and_learner`. + if self.enable_env_runner_and_connector_v2: raise ValueError( "You are using the new API stack EnvRunners (SingleAgentEnvRunner " "or MultiAgentEnvRunner), but have forgotten to switch on the new " "API stack! Try setting " - "`config.experimental(_enable_new_api_stack=True)`." + "`config.api_stack(enable_rl_module_and_learner=True)`." ) - # Early out. The rest of this method is only for _enable_new_api_stack=True. + # Early out. The rest of this method is only for enable_rl_module_and_learner=True. return # New API stack (RLModule, Learner APIs) only works with connectors. @@ -4007,7 +4030,7 @@ def _validate_new_api_stack_settings(self): # gym.vector.Env yet and therefore the reset call is still made manually, # allowing for the callback to be fired). if ( - self.uses_new_env_runners + self.enable_env_runner_and_connector_v2 and not self.is_multi_agent() and self.callbacks_class is not DefaultCallbacks ): @@ -4044,8 +4067,8 @@ def _validate_new_api_stack_settings(self): "Cannot use `{}` option with the new API stack (RLModule and " "Learner APIs)! `{}` is part of the ModelV2 API and Policy API," " which are not compatible with the new API stack. You can either " - "deactivate the new stack via `config.experimental( " - "_enable_new_api_stack=False)`," + "deactivate the new stack via `config.api_stack( " + "enable_rl_module_and_learner=False)`," "or use the new stack (incl. RLModule API) and implement your " "custom model as an RLModule." ) @@ -4066,7 +4089,7 @@ def _validate_new_api_stack_settings(self): # anymore. def _validate_to_be_deprecated_settings(self): # Env task fn will be deprecated. 
- if self._enable_new_api_stack and self.env_task_fn is not None: + if self.enable_rl_module_and_learner and self.env_task_fn is not None: deprecation_warning( old="AlgorithmConfig.env_task_fn", help="The `env_task_fn` API is not supported on the new API stack! " @@ -4106,7 +4129,7 @@ def _validate_to_be_deprecated_settings(self): if self.simple_optimizer is True: pass # Multi-GPU setting: Must use MultiGPUTrainOneStep. - elif not self._enable_new_api_stack and self.num_gpus > 1: + elif not self.enable_rl_module_and_learner and self.num_gpus > 1: # TODO: AlphaStar uses >1 GPUs differently (1 per policy actor), so this is # ok for tf2 here. # Remove this hacky check, once we have fully moved to the Learner API. @@ -4350,6 +4373,11 @@ def rollouts(self, *args, **kwargs): def exploration(self, *args, **kwargs): return self.env_runners(*args, **kwargs) + @Deprecated(new="AlgorithmConfig.enable_env_runner_and_connector_v2", error=True) + @property + def uses_new_env_runners(self): + return None + class TorchCompileWhatToCompile(str, Enum): """Enumerates schemes of what parts of the TorchLearner can be compiled. diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 4f5caa1259f9..5228e6da3771 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -116,8 +116,8 @@ def __init__(self, algo_class=None): self.broadcast_interval = 1 self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by # global_norm, no matter the value of `grad_clip_by`. self.grad_clip_by = "global_norm" @@ -187,7 +187,7 @@ def training( networks and tuned the kl loss coefficients that are used during training. NOTE: This parameter is only applicable when using the Learner API - (_enable_new_api_stack=True). + (enable_rl_module_and_learner=True). Returns: @@ -272,7 +272,7 @@ def __init__(self, config, *args, **kwargs): # TODO(avnishn): Does this need to happen in __init__? I think we can move it # to setup() - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: self.workers.local_worker().foreach_policy_to_train( lambda p, _: p.update_target() ) @@ -290,7 +290,7 @@ def after_train_step(self, train_results: ResultDict) -> None: training step. """ - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if NUM_TARGET_UPDATES in train_results: self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES] self._counters[LAST_TARGET_UPDATE_TS] = train_results[ @@ -376,7 +376,7 @@ def get_default_policy_class( return APPOTorchPolicy elif config["framework"] == "tf": - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: raise ValueError( "RLlib's RLModule and Learner API is not supported for" " tf1. Use " diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 5f6ad462d43b..c39d09f3a989 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -86,7 +86,7 @@ def __init__( # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. 
- if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceClipGradients.__init__(self) @@ -111,7 +111,7 @@ def __init__( ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 3337f021c3c5..c62e01941ca3 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -74,7 +74,7 @@ def __init__(self, observation_space, action_space, config): # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not config.get("_enable_new_api_stack", False): + if not config.get("enable_rl_module_and_learner", False): # Although this is a no-op, we call __init__ here to make it clear # that base.__init__ will use the make_model() call. VTraceOptimizer.__init__(self) diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index c6f1b4d96307..a3fcf10d5612 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -56,7 +56,7 @@ def test_appo_loss(self): """Test that appo_policy_rlm loss matches the appo learner loss.""" config = ( appo.APPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -104,7 +104,7 @@ def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( appo.APPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") # Asynchronous Algo, make sure we have some results after 1 iteration. 
.reporting(min_time_s_per_iteration=10) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 1ff7b6fd0a74..7b460a59ae43 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -72,7 +72,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 41c45958299b..34e2e7e15990 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -87,7 +87,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/bc/bc.py b/rllib/algorithms/bc/bc.py index 29e2693f159c..f26e061600f4 100644 --- a/rllib/algorithms/bc/bc.py +++ b/rllib/algorithms/bc/bc.py @@ -73,7 +73,7 @@ def __init__(self, algo_class=None): # not important for behavioral cloning. self.postprocess_inputs = False # Set RLModule as default. - self.experimental(_enable_new_api_stack=True) + self.api_stack(enable_rl_module_and_learner=True) # __sphinx_doc_end__ # fmt: on @@ -137,7 +137,7 @@ def get_default_config(cls) -> AlgorithmConfig: @override(MARWIL) def training_step(self) -> ResultDict: - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Using ModelV2. return super().training_step() else: diff --git a/rllib/algorithms/bc/tests/test_bc.py b/rllib/algorithms/bc/tests/test_bc.py index 89acd8102caa..072bba5cb712 100644 --- a/rllib/algorithms/bc/tests/test_bc.py +++ b/rllib/algorithms/bc/tests/test_bc.py @@ -48,7 +48,7 @@ def test_bc_compilation_and_learning_from_offline_file(self): # Test for RLModule API and ModelV2. for rl_modules in [True, False]: - config.experimental(_enable_new_api_stack=rl_modules) + config.api_stack(enable_rl_module_and_learner=rl_modules) # Old and new stack support different frameworks if rl_modules: frameworks_to_test = ("torch", "tf2") diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 19908cfc04db..b170313e671f 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -124,8 +124,8 @@ def __init__(self, algo_class=None): # `training()` self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by # global_norm, no matter the value of `grad_clip_by`. self.grad_clip_by = "global_norm" self.lr = 5e-4 @@ -392,7 +392,7 @@ def validate(self) -> None: super().validate() if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.exploration_config["type"] == "ParameterNoise" ): if self.batch_mode != "complete_episodes": @@ -402,7 +402,7 @@ def validate(self) -> None: "batch_mode='complete_episodes')`." 
) - if not self.uses_new_env_runners and not self.in_evaluation: + if not self.enable_env_runner_and_connector_v2 and not self.in_evaluation: validate_buffer_config(self) if self.td_error_loss_fn not in ["huber", "mse"]: @@ -423,7 +423,7 @@ def validate(self) -> None: # TODO (simon): Find a clean solution to deal with # configuration configs when using the new API stack. if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.exploration_config["type"] == "ParameterNoise" ): if self.batch_mode != "complete_episodes": @@ -446,7 +446,7 @@ def validate(self) -> None: ) if ( - self.uses_new_env_runners + self.enable_env_runner_and_connector_v2 and not isinstance(self.replay_buffer_config["type"], str) and not issubclass(self.replay_buffer_config["type"], EpisodeReplayBuffer) ): @@ -572,7 +572,7 @@ def training_step(self) -> ResultDict: The results dict from executing the training iteration. """ # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack() # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py index 437d35df82a1..373bbc9b654e 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -149,7 +149,7 @@ def __init__(self, algo_class=None): # with RLlib's `RemoteVectorEnv`). self.remote_worker_envs = True # Dreamer only runs on the new API stack. - self._enable_new_api_stack = True + self.enable_rl_module_and_learner = True # __sphinx_doc_end__ # fmt: on @@ -382,10 +382,10 @@ def validate(self) -> None: raise ValueError("DreamerV3 does NOT support multi-agent setups yet!") # Make sure, we are configure for the new API stack. - if not self._enable_new_api_stack: + if not self.enable_rl_module_and_learner: raise ValueError( - "DreamerV3 must be run with `config.experimental(" - "_enable_new_api_stack=True)`!" + "DreamerV3 must be run with `config.api_stack(" + "enable_rl_module_and_learner=True)`!" ) # If run on several Learners, the provided batch_size_B must be a multiple diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 41962010cf2e..6e2291230251 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -130,8 +130,8 @@ def __init__(self, algo_class=None): self.num_aggregation_workers = 0 self.grad_clip = 40.0 - # Note: Only when using _enable_new_api_stack=True can the clipping mode be - # configured by the user. On the old API stack, RLlib will always clip by + # Note: Only when using enable_rl_module_and_learner=True can the clipping mode + # be configured by the user. On the old API stack, RLlib will always clip by # global_norm, no matter the value of `grad_clip_by`. self.grad_clip_by = "global_norm" @@ -231,7 +231,7 @@ def training( each SGD iteration. If "auto", will use the same value as `train_batch_size`. Note that this setting only has an effect if - `_enable_new_api_stack=True` and it must be a multiple of + `enable_rl_module_and_learner=True` and it must be a multiple of `rollout_fragment_length` or `sequence_length` and smaller than or equal to `train_batch_size`. num_sgd_iter: Number of passes to make over each train batch. 
@@ -366,13 +366,13 @@ def validate(self) -> None: # New stack w/ EnvRunners does NOT support aggregation workers yet or a mixin # replay buffer. - if self.uses_new_env_runners: + if self.enable_env_runner_and_connector_v2: if self.num_aggregation_workers > 0: raise ValueError( "Aggregation workers not supported on new API stack w/ new " "EnvRunner API! Set `config.num_aggregation_workers = 0` or " "disable the new API stack via " - "`config.experimental(_enable_new_api_stack=False)`." + "`config.api_stack(enable_rl_module_and_learner=False)`." ) if self.replay_ratio != 0.0: raise ValueError( @@ -387,7 +387,7 @@ def validate(self) -> None: ) # Entropy coeff schedule checking. - if self._enable_new_api_stack: + if self.enable_rl_module_and_learner: if self.entropy_coeff_schedule is not None: raise ValueError( "`entropy_coeff_schedule` is deprecated and must be None! Use the " @@ -428,7 +428,7 @@ def validate(self) -> None: ) # Learner API specific checks. if ( - self._enable_new_api_stack + self.enable_rl_module_and_learner and self._minibatch_size != "auto" and not ( (self.minibatch_size % self.rollout_fragment_length == 0) @@ -458,7 +458,7 @@ def minibatch_size(self): return ( ( self.train_batch_size_per_learner - if self.uses_new_env_runners + if self.enable_env_runner_and_connector_v2 else self.train_batch_size ) if self._minibatch_size == "auto" @@ -656,7 +656,7 @@ def setup(self, config: AlgorithmConfig): # update of the learner group self._results = {} - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Create and start the learner thread. self._learner_thread = make_learner_thread( self.workers.local_worker(), self.config @@ -667,7 +667,7 @@ def setup(self, config: AlgorithmConfig): def training_step(self) -> ResultDict: # First, check, whether our learner thread is still healthy. if ( - not self.config._enable_new_api_stack + not self.config.enable_rl_module_and_learner and not self._learner_thread.is_alive() ): raise RuntimeError("The learner thread died while training!") @@ -706,7 +706,7 @@ def training_step(self) -> ResultDict: self.concatenate_batches_and_pre_queue(batches) # Using the Learner API. Call `update()` on our LearnerGroup object with # all collected batches. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: train_results = self.learn_on_processed_samples() module_ids_to_update = set(train_results.keys()) - {ALL_MODULES} additional_results = self.learner_group.additional_update( @@ -734,7 +734,7 @@ def training_step(self) -> ResultDict: # Sync worker weights (only those policies that were actually updated). with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if train_results: pids = list(set(train_results.keys()) - {ALL_MODULES}) self.update_workers_from_learner_group( @@ -758,7 +758,7 @@ def training_step(self) -> ResultDict: mark_healthy=True, ) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: if train_results: # Store the most recent result and return it if no new result is # available. This keeps backwards compatibility with the old @@ -821,9 +821,9 @@ def default_resource_request( else [] ) ) - # TODO(avnishn): Remove this once we have a way to extend placement group - # factories. - if cf._enable_new_api_stack: + # TODO (avnishn): Remove this once we have a way to extend placement group + # factories. 
+ if cf.enable_rl_module_and_learner: # Resources for the Algorithm. learner_bundles = cls._get_learner_bundles(cf) @@ -981,7 +981,7 @@ def learn_on_processed_samples(self) -> ResultDict: def place_processed_samples_on_learner_thread_queue(self) -> None: """Place processed samples on the learner queue for training. - NOTE: This method is called if self.config._enable_new_api_stack is False. + NOTE: This method is called if self.config.enable_rl_module_and_learner is False. """ for i, batch in enumerate(self.batches_to_place_on_learner): @@ -1008,7 +1008,7 @@ def place_processed_samples_on_learner_thread_queue(self) -> None: def process_trained_results(self) -> ResultDict: """Process training results that are outputed by the learner thread. - NOTE: This method is called if self.config._enable_new_api_stack is False. + NOTE: This method is called if self.config.enable_rl_module_and_learner is False. Returns: Aggregated results from the learner thread after an update is completed. @@ -1234,7 +1234,7 @@ def _get_additional_update_kwargs(self, train_results: dict) -> dict: @override(Algorithm) def _compile_iteration_results(self, *args, **kwargs): result = super()._compile_iteration_results(*args, **kwargs) - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: result = self._learner_thread.add_learner_metrics( result, overwrite_learner_info=False ) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 555a15cbb8b4..6b8b592454f9 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -178,7 +178,7 @@ def compute_gradients_fn( self, optimizer: LocalOptimizer, loss: TensorType ) -> ModelGradients: # Supporting more than one loss/optimizer. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # In order to access the variables for rl modules, we need to # use the underlying keras api model.trainable_variables. trainable_variables = self.model.trainable_variables @@ -302,7 +302,7 @@ def __init__( # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not self.config.get("_enable_new_api_stack"): + if not self.config.get("enable_rl_module_and_learner"): GradStatsMixin.__init__(self) VTraceClipGradients.__init__(self) VTraceOptimizer.__init__(self) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index c6f9a19eeb57..547c2405acb0 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -242,7 +242,7 @@ def __init__(self, observation_space, action_space, config): # However, we also would like to avoid creating special Policy-subclasses # for this as the entire Policy concept will soon not be used anymore with # the new Learner- and RLModule APIs. - if not config.get("_enable_new_api_stack"): + if not config.get("enable_rl_module_and_learner"): VTraceOptimizer.__init__(self) # Need to initialize learning rate variable before calling # TorchPolicyV2.__init__. 
diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index b4b90cb8305a..2e55e21eabae 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -58,7 +58,7 @@ def test_impala_loss(self): """ config = ( ImpalaConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index 0cf8ec62875c..d9f3d7ecf621 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -23,7 +23,7 @@ def tearDownClass(cls) -> None: def test_impala_off_policyness(self): config = ( impala.ImpalaConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .resources(num_gpus=0) .env_runners(num_env_runners=4) diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 1f93aff30767..9d0c084a25b8 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -60,7 +60,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index c93a5c885c6c..ff061ab257b7 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -68,7 +68,7 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] else: bootstrap_values_time_major = make_time_major( diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 0d4741625b72..8f2281648eb8 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -321,7 +321,7 @@ def validate(self) -> None: # we subsample a batch of `sgd_minibatch_size` from the train-batch for # each `num_sgd_iter`). if ( - not self._enable_new_api_stack + not self.enable_rl_module_and_learner and self.sgd_minibatch_size > self.train_batch_size ): raise ValueError( @@ -331,7 +331,7 @@ def validate(self) -> None: f"is iterated over (used for updating the policy) {self.num_sgd_iter} " "times." ) - elif self._enable_new_api_stack: + elif self.enable_rl_module_and_learner: mbs = self.mini_batch_size_per_learner or self.sgd_minibatch_size tbs = self.train_batch_size_per_learner or self.train_batch_size if mbs > tbs: @@ -359,7 +359,7 @@ def validate(self) -> None: ) # Entropy coeff schedule checking. - if self._enable_new_api_stack: + if self.enable_rl_module_and_learner: if self.entropy_coeff_schedule is not None: raise ValueError( "`entropy_coeff_schedule` is deprecated and must be None! 
Use the " @@ -407,7 +407,7 @@ def get_default_policy_class( @override(Algorithm) def training_step(self): # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack() # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). @@ -423,14 +423,14 @@ def _training_step_new_api_stack(self) -> ResultDict: worker_set=self.workers, max_agent_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=self.config.enable_env_runner_and_connector_v2, ) else: episodes = synchronous_parallel_sample( worker_set=self.workers, max_env_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=self.config.enable_env_runner_and_connector_v2, ) # Return early if all our workers failed. if not episodes: @@ -521,7 +521,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: train_batch = standardize_fields(train_batch, ["advantages"]) # Perform a train step on the collected batch. - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: mini_batch_size_per_learner = ( self.config.mini_batch_size_per_learner or self.config.sgd_minibatch_size @@ -537,7 +537,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: else: train_results = multi_gpu_train_one_step(self, train_batch) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: # The train results's loss keys are pids to their loss values. But we also # return a total_loss key at the same level as the pid keys. So we need to # subtract that to get the total set of pids to update. @@ -563,7 +563,7 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: if self.workers.num_remote_workers() > 0: from_worker_or_learner_group = None - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: # sync weights from learner_group to all rollout workers from_worker_or_learner_group = self.learner_group self.workers.sync_weights( @@ -571,11 +571,11 @@ def _training_step_old_and_hybrid_api_stacks(self) -> ResultDict: policies=policies_to_update, global_vars=global_vars, ) - elif self.config._enable_new_api_stack: + elif self.config.enable_rl_module_and_learner: weights = self.learner_group.get_weights() self.workers.local_worker().set_weights(weights) - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: kl_dict = {} if self.config.use_kl_loss: for pid in policies_to_update: diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index d31c8e47552e..9b6c6f3d1876 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -60,7 +60,7 @@ def _update_from_batch_or_episodes( ): # First perform GAE computation on the entirety of the given train data (all # episodes). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: batch, episodes = self._compute_gae_from_episodes(episodes=episodes) # Now that GAE (advantages and value targets) have been added to the train # batch, we can proceed normally (calling super method) with the update step. 
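The `PPOConfig.validate()` hunk above applies the same minibatch check on both stacks; on the new stack the per-learner fields take precedence and fall back to the legacy ones. A small worked sketch of that precedence rule (values are made up; only the `or`-fallback and the comparison mirror the hunk):

```python
# Illustrative values; the fallback/validation logic mirrors PPOConfig.validate().
mini_batch_size_per_learner = None   # unset -> fall back to sgd_minibatch_size
sgd_minibatch_size = 128
train_batch_size_per_learner = None  # unset -> fall back to train_batch_size
train_batch_size = 4000

mbs = mini_batch_size_per_learner or sgd_minibatch_size   # -> 128
tbs = train_batch_size_per_learner or train_batch_size    # -> 4000
if mbs > tbs:
    raise ValueError(
        f"Minibatch size ({mbs}) must be <= total train batch size ({tbs})."
    )
```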
diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 8541995302e0..914cc9b34bff 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -128,9 +128,6 @@ def test_ppo_compilation_w_connectors(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when the scheduler is supported in the new - # Learner API stack. - .experimental(_enable_new_api_stack=False) .training( num_sgd_iter=2, # Setup lr schedule for testing. @@ -202,9 +199,6 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when the scheduler is supported in the new - # Learner API stack. - .experimental(_enable_new_api_stack=False) .training( # Setup lr schedule for testing. lr_schedule=[[0, 5e-5], [256, 0.0]], @@ -275,7 +269,6 @@ def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" config = ( ppo.PPOConfig() - # .experimental(_enable_new_api_stack=True) .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, @@ -325,9 +318,6 @@ def test_ppo_free_log_std(self): config = ( ppo.PPOConfig() - # TODO (Kourosh): Enable when free log std is supported in the new - # Learner API stack. - .experimental(_enable_new_api_stack=False) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -392,7 +382,6 @@ def test_ppo_loss_function(self): """ config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=False) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 2d897caca974..cc48fc6b1ca9 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -56,7 +56,7 @@ def tearDownClass(cls): def test_loss(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -104,7 +104,7 @@ def test_save_load_state(self): """Tests saving and loading the state of the PPO Learner Group.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -142,7 +142,7 @@ def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_env_runners=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index bc318db78bff..9391269aa63e 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -75,7 +75,7 @@ def test_ppo_compilation_and_schedule_mixins(self): config = ( ppo.PPOConfig() # Enable new API stack and use EnvRunner. 
- .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 724f64374ed2..6e9810093c96 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -78,7 +78,7 @@ def test_ppo_compilation_and_schedule_mixins(self): # Build a PPOConfig object. config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .training( num_sgd_iter=2, # Setup lr schedule for testing lr-scheduling correctness. @@ -137,7 +137,7 @@ def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, @@ -181,7 +181,7 @@ def test_ppo_free_log_std_with_rl_modules(self): """Tests the free log std option works.""" config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("Pendulum-v1") .env_runners( num_env_runners=1, diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 87fd6830f36e..56246981bac9 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -364,7 +364,7 @@ def validate(self) -> None: # Validate that we use the corresponding `EpisodeReplayBuffer` when using # episodes. # TODO (sven, simon): Implement the multi-agent case for replay buffers. - if self.uses_new_env_runners and self.replay_buffer_config["type"] not in [ + if self.enable_env_runner_and_connector_v2 and self.replay_buffer_config["type"] not in [ "EpisodeReplayBuffer", "PrioritizedEpisodeReplayBuffer", ]: @@ -449,7 +449,7 @@ def get_default_policy_class( def training_step(self) -> ResultDict: # If `RolloutWorker` is used, fall back to the old stack `training step` # of `DQN`. - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: return super().training_step() # Alternate between storing and sampling and training. @@ -464,7 +464,7 @@ def training_step(self) -> ResultDict: episodes = synchronous_parallel_sample( worker_set=self.workers, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=self.config.enable_env_runner_and_connector_v2, ) # TODO (sven): single- vs multi-agent. 
self._counters[NUM_AGENT_STEPS_SAMPLED] += sum(len(e) for e in episodes) diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 9d50086c9398..522ce154fee8 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -171,7 +171,7 @@ def test_detect_atari_env(self): def test_rl_module_api(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .framework("torch") .env_runners(enable_connectors=True) @@ -231,7 +231,7 @@ def test_config_per_module(self): def test_learner_api(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners(enable_connectors=True) .framework("tf2") @@ -360,7 +360,7 @@ def get_default_rl_module_spec(self): ######################################## # This is the simplest case where we have to construct the marl module based on # the default specs only. - config = SingleAgentAlgoConfig().experimental(_enable_new_api_stack=True) + config = SingleAgentAlgoConfig().api_stack(enable_rl_module_and_learner=True) spec, expected = self._get_expected_marl_spec(config, DiscreteBCTorchModule) self._assertEqualMARLSpecs(spec, expected) @@ -376,7 +376,7 @@ def get_default_rl_module_spec(self): # algorithm to assign a specific type of RLModule class to certain module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( module_specs={ @@ -395,7 +395,7 @@ def get_default_rl_module_spec(self): # RLModule class to ALL module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), ) @@ -414,7 +414,7 @@ def get_default_rl_module_spec(self): # RLModule class to ALL module_ids. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) @@ -437,7 +437,7 @@ def get_default_rl_module_spec(self): # in the multi-agent scenario. config = ( SingleAgentAlgoConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( marl_module_class=CustomMARLModule1, @@ -474,8 +474,9 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, but the MultiAgentRLModuleSpec has not defined its # SingleAgentRLmoduleSpecs. - config = MultiAgentAlgoConfigWithNoSingleAgentSpec().experimental( - _enable_new_api_stack=True + config = ( + MultiAgentAlgoConfigWithNoSingleAgentSpec() + .api_stack(enable_rl_module_and_learner=True) ) self.assertRaisesRegex( @@ -488,7 +489,7 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, and the MultiAgentRLModuleSpec has defined its # SingleAgentRLmoduleSpecs. 
- config = MultiAgentAlgoConfig().experimental(_enable_new_api_stack=True) + config = MultiAgentAlgoConfig().api_stack(enable_rl_module_and_learner=True) spec, expected = self._get_expected_marl_spec( config, DiscreteBCTorchModule, expected_marl_module_class=CustomMARLModule1 diff --git a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py index 8a3579cb5339..d5ddec5c79f7 100644 --- a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py +++ b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py @@ -29,7 +29,7 @@ def save_test(alg_name, framework="tf", multi_agent=False): ) if alg_name in RLMODULE_SUPPORTED_ALGOS: - config = config.experimental(_enable_new_api_stack=False) + config = config.api_stack(enable_rl_module_and_learner=False) if "DDPG" in alg_name or "SAC" in alg_name: config.environment("Pendulum-v1") diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 02b007eab2aa..7b7c66ad96a3 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -72,7 +72,7 @@ def tearDownClass(cls): def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_rollout_workers=0, @@ -115,7 +115,7 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( batch_mode="complete_episodes", @@ -158,7 +158,7 @@ def test_overriding_on_episode_created_throws_error_on_new_api_stack(self): """Tests, whw""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=SingleAgentEnvRunner) .callbacks(OnEpisodeCreatedCallback) ) diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 6ccd1f6ef809..92551b8c0646 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -389,7 +389,7 @@ def test_fatal_single_agent(self): # Test the case where all workers fail (w/o recovery). self._do_test_failing_fatal( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=lambda env: FlattenObservations(), @@ -400,7 +400,7 @@ def test_fatal_multi_agent(self): # Test the case where all workers fail (w/o recovery). 
self._do_test_failing_fatal( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=MultiAgentEnvRunner) .multi_agent(policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0"), ) @@ -409,7 +409,7 @@ def test_fatal_multi_agent(self): # def test_async_samples(self): # self._do_test_fault_ignore( # ImpalaConfig() - # .experimental(_enable_new_api_stack=True) + # .api_stack(enable_rl_module_and_learner=True) # .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) # .resources(num_gpus=0) # ) @@ -417,7 +417,7 @@ def test_fatal_multi_agent(self): def test_sync_replay(self): self._do_test_failing_ignore( SACConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment( env_config={"action_space": gym.spaces.Box(0, 1, (2,), np.float32)} ) @@ -429,7 +429,7 @@ def test_sync_replay(self): def test_multi_gpu(self): self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, ) @@ -443,7 +443,7 @@ def test_multi_gpu(self): def test_sync_samples(self): self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(optimizer={}) ) @@ -452,7 +452,7 @@ def test_eval_workers_failing_ignore(self): # Test the case where one eval worker fails, but we chose to ignore. self._do_test_failing_ignore( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(model={"fcnet_hiddens": [4]}), fail_eval=True, @@ -462,7 +462,7 @@ def test_eval_workers_parallel_to_training_failing_recover(self): # Test the case where all eval workers fail, but we chose to recover. config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .evaluation( evaluation_num_env_runners=1, @@ -482,7 +482,7 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( # to recover. config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) .multi_agent( policies={"main", "p0", "p1"}, @@ -518,7 +518,7 @@ def test_workers_failing_recover(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -572,7 +572,7 @@ def test_modules_are_restored_on_recovered_worker(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent, num_env_runners=2, @@ -674,7 +674,7 @@ def test_eval_workers_failing_recover(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -740,7 +740,7 @@ def test_worker_failing_recover_with_hanging_workers(self): # the execution of the algorithm b/c of a single heavily stalling worker. # Timeout data (batches or episodes) are discarded. 
SACConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .training( replay_buffer_config={"type": "EpisodeReplayBuffer"}, ) diff --git a/rllib/connectors/agent/state_buffer.py b/rllib/connectors/agent/state_buffer.py index 91e22990560a..97361f790373 100644 --- a/rllib/connectors/agent/state_buffer.py +++ b/rllib/connectors/agent/state_buffer.py @@ -33,7 +33,7 @@ def __init__(self, ctx: ConnectorContext, states: Any = None): self._action_space_struct = get_base_struct_from_space(ctx.action_space) self._states = defaultdict(lambda: defaultdict(lambda: (None, None, None))) - self._enable_new_api_stack = ctx.config.get("_enable_new_api_stack", False) + self._enable_new_api_stack = ctx.config.get("enable_rl_module_and_learner", False) # TODO(jungong) : we would not need this if policies are never stashed # during the rollout of a single episode. if states: diff --git a/rllib/connectors/agent/view_requirement.py b/rllib/connectors/agent/view_requirement.py index 1a079792bd32..f95e3fec8d35 100644 --- a/rllib/connectors/agent/view_requirement.py +++ b/rllib/connectors/agent/view_requirement.py @@ -36,7 +36,7 @@ def __init__(self, ctx: ConnectorContext): super().__init__(ctx) self._view_requirements = ctx.view_requirements - _enable_new_api_stack = ctx.config.get("_enable_new_api_stack", False) + _enable_new_api_stack = ctx.config.get("enable_rl_module_and_learner", False) # a dict of env_id to a dict of agent_id to a list of agent_collector objects self.agent_collectors = defaultdict( diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 883e02b57bdb..d0af75fb6c6d 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -299,7 +299,7 @@ def build(self) -> None: return # Build learner connector pipeline used on this Learner worker. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # TODO (sven): Figure out which space to provide here. For now, # it doesn't matter, as the default connector piece doesn't use # this information anyway. 
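The connector and Learner hunks above read the two flags in different ways: new code paths use attribute access (`config.enable_rl_module_and_learner`, `config.enable_env_runner_and_connector_v2`), while connector pieces and policies still look the key up in a plain config dict with a `False` default. A hedged sketch, assuming `api_stack()` accepts both keyword arguments (as the attribute names in these hunks suggest):

```python
# Sketch under the assumption that both flags are settable via api_stack().
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("CartPole-v1")
)

# New code paths read the flags as attributes (e.g. Learner.build() above).
assert config.enable_rl_module_and_learner
assert config.enable_env_runner_and_connector_v2

# Old code paths (connector pieces, policies) receive a plain config dict and
# use `.get("enable_rl_module_and_learner", False)` so that configs created
# before the rename, which lack the key, default to the old stack.
```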
diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index b253feb1a5e8..f7f81074b32a 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -388,7 +388,7 @@ def build_vf_head(self, framework): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyCatalog), ) diff --git a/rllib/core/testing/tests/test_bc_algorithm.py b/rllib/core/testing/tests/test_bc_algorithm.py index a41250c43c4e..fe798a4a4846 100644 --- a/rllib/core/testing/tests/test_bc_algorithm.py +++ b/rllib/core/testing/tests/test_bc_algorithm.py @@ -33,8 +33,8 @@ def test_bc_algorithm(self): config = ( BCConfigTest() + .api_stack(enable_rl_module_and_learner=True) .training(model={"fcnet_hiddens": [32, 32]}) - .experimental(_enable_new_api_stack=True) ) # TODO (Kourosh): Add tf2 support @@ -54,7 +54,7 @@ def test_bc_algorithm_marl(self): policies = {"policy_1", "policy_2"} config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .training(model={"fcnet_hiddens": [32, 32]}) .multi_agent( policies=policies, @@ -98,7 +98,7 @@ def test_bc_algorithm_w_custom_marl_module(self): config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework(fw) .rl_module(rl_module_spec=spec) .training( diff --git a/rllib/env/tests/test_multi_agent_env_runner.py b/rllib/env/tests/test_multi_agent_env_runner.py index d27f4779bd67..5cb1a889feff 100644 --- a/rllib/env/tests/test_multi_agent_env_runner.py +++ b/rllib/env/tests/test_multi_agent_env_runner.py @@ -95,7 +95,7 @@ def _build_config(self): # Build the configuration and use `PPO`. config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment( MultiAgentCartPole, env_config={"num_agents": 2}, diff --git a/rllib/evaluation/env_runner_v2.py b/rllib/evaluation/env_runner_v2.py index 2c095fe4b12c..8b6a0910bacf 100644 --- a/rllib/evaluation/env_runner_v2.py +++ b/rllib/evaluation/env_runner_v2.py @@ -181,7 +181,7 @@ def _build_multi_agent_batch( policy = collector.policy - if policy.config.get("_enable_new_api_stack", False): + if policy.config.get("enable_rl_module_and_learner", False): # Before we send the collected batch back for training, we may need # to add a time dimension for the RLModule. seq_lens = batch.get(SampleBatch.SEQ_LENS) @@ -1072,7 +1072,7 @@ def _try_find_policy_again(eval_data: AgentConnectorDataType): # changed (mapping fn not staying constant within one episode). policy: Policy = _try_find_policy_again(eval_data) - if policy.config.get("_enable_new_api_stack", False): + if policy.config.get("enable_rl_module_and_learner", False): # _batch_inference_sample_batches does nothing but concatenating AND # setting SEQ_LENS to ones in the recurrent case. We do not need this # because RLModules do not care about SEQ_LENS anymore. 
They have an diff --git a/rllib/evaluation/episode_v2.py b/rllib/evaluation/episode_v2.py index 25ed5a36a719..03338bcc28ca 100644 --- a/rllib/evaluation/episode_v2.py +++ b/rllib/evaluation/episode_v2.py @@ -191,7 +191,7 @@ def add_init_obs( ), is_policy_recurrent=policy.is_recurrent(), intial_states=policy.get_initial_state(), - _enable_new_api_stack=policy.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=policy.config.get("enable_rl_module_and_learner", False), ) self._agent_collectors[agent_id].add_init_obs( episode_id=self.episode_id, diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 65ebfc3350c5..6c8afce541d5 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -270,7 +270,7 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample input_dict = sample_batch.get_single_step_input_dict( policy.view_requirements, index="last" ) - if policy.config.get("_enable_new_api_stack"): + if policy.config.get("enable_rl_module_and_learner"): # Note: During sampling you are using the parameters at the beginning of # the sampling process. If I'll be using this advantages during training # should it not be the latest parameters during training for this to be diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index f0623b3cbb00..fe5852467408 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -492,7 +492,7 @@ def wrap(env): ) # This is only for the old API where local_worker was responsible for learning - if not self.config._enable_new_api_stack: + if not self.config.enable_rl_module_and_learner: # Error if we don't find enough GPUs. if ( ray.is_initialized() @@ -537,7 +537,7 @@ def wrap(env): # state. for pol in self.policy_map.values(): if not pol._model_init_state_automatically_added and not pol.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ): pol._update_model_view_requirements_from_init_state() @@ -1127,7 +1127,7 @@ def add_policy( """ validate_policy_id(policy_id, error=False) - if module_spec is not None and not self.config._enable_new_api_stack: + if module_spec is not None and not self.config.enable_rl_module_and_learner: raise ValueError( "If you pass in module_spec to the policy, the RLModule API needs " "to be enabled." @@ -1717,7 +1717,7 @@ def _update_policy_map( updated_policy_dict = self._get_complete_policy_specs_dict(policy_dict) # Use the updated policy dict to create the marl_module_spec if necessary - if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: spec = self.config.get_marl_module_spec( policy_dict=updated_policy_dict, single_agent_rl_module_spec=single_agent_rl_module_spec, @@ -1796,7 +1796,7 @@ def _get_complete_policy_specs_dict( obs_space, merged_conf.model, include_multi_binary=self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), ) # Original observation space should be accessible at @@ -1863,7 +1863,7 @@ def _build_policy_map( new_policy = policy # Maybe torch compile an RLModule. 
- if self.config.get("_enable_new_api_stack", False) and self.config.get( + if self.config.get("enable_rl_module_and_learner", False) and self.config.get( "torch_compile_worker" ): if self.config.framework_str != "torch": diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index 03f1b51f2130..7f90c15dc541 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -83,7 +83,7 @@ def test_env_crash_on_one_worker_during_sampling_but_ignore(self): """Expect some sub-envs on one worker to fail (and not recover), but ignore.""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( num_env_runners=2, num_envs_per_env_runner=3, @@ -119,7 +119,7 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): """Expect some sub-envs to fail (and not recover), but re-create worker.""" config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( # env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 3cd81eb6dce3..fdb1e128df35 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -105,7 +105,7 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): # and Learner API. config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("CartPole-v1") # Activate LSTM + prev-action + rewards. .training( @@ -185,7 +185,7 @@ def test_traj_view_attention_net(self): config = ( ppo.PPOConfig() # Batch-norm models have not been migrated to the RL Module API yet. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment( "ray.rllib.examples.envs.classes.debug_counter_env.DebugCounterEnv", env_config={"config": {"start_at_t": 1}}, # first obs is [1.0] @@ -227,7 +227,7 @@ def test_traj_view_next_action(self): action_space = Discrete(2) config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("torch") .env_runners(rollout_fragment_length=200, num_env_runners=0) ) @@ -305,7 +305,7 @@ def policy_fn(agent_id, episode, worker, **kwargs): config = ( ppo.PPOConfig() # The Policy used to be passed in, now we have to pass in the RLModuleSpecs - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .framework("torch") .multi_agent(policies=policies, policy_mapping_fn=policy_fn) .training( @@ -330,7 +330,7 @@ def test_counting_by_agent_steps(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) # Env setup. 
.environment(MultiAgentPendulum, env_config={"num_agents": num_agents}) .env_runners(num_env_runners=2, rollout_fragment_length=21) diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index fd8f18b0932f..e4d0aba5d58d 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -238,7 +238,7 @@ def _setup( if ( local_worker and self.__worker_manager.num_actors() > 0 - and not config.uses_new_env_runners + and not config.enable_env_runner_and_connector_v2 and not config.create_env_on_local_worker and (not config.observation_space or not config.action_space) ): diff --git a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py index 156ead8f3341..47ce9b92c884 100644 --- a/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py +++ b/rllib/examples/_old_api_stack/policy/episode_env_aware_policy.py @@ -37,7 +37,7 @@ def is_recurrent(self): return True def get_initial_state(self): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # convert the tree of tensors to a tree to numpy arrays return tree.map_structure( lambda s: convert_to_numpy(s), self.model.get_initial_state() diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index 42d2613106d0..c9bab618fdf1 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -109,10 +109,8 @@ def get_cli_args(): ) # We need to disable preprocessing of observations, because preprocessing # would flatten the observation dict of the environment. - .experimental( - _enable_new_api_stack=True, - _disable_preprocessor_api=True, - ) + .api_stack(enable_rl_module_and_learner=True) + .experimental(_disable_preprocessor_api=True) .framework(args.framework) .resources( # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 686bb08d7f9c..b4b938ff9881 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -192,8 +192,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( AlgorithmConfig() - # TODO (Kourosh): Migrate this to the new RLModule / Learner API. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("multi_agent_cartpole") .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index 5d073bf94b15..f81ae348c1f9 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -140,7 +140,7 @@ def get_cli_args(): get_trainable_cls(args.run) .get_default_config() # Batch-norm models have not been migrated to the RL Module API yet. 
- .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment(CorrelatedActionsEnv) .framework(args.framework) .training(gamma=0.5) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index 4f94994d8586..c2e48cbab0c5 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -44,7 +44,7 @@ def _get_encoder_config( # Create a generic config with our enhanced Catalog ppo_config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec( catalog_class=MobileNetEnhancedPPOCatalog diff --git a/rllib/examples/checkpoints/onnx_tf.py b/rllib/examples/checkpoints/onnx_tf.py index 0093afd0fd9e..348de227c075 100644 --- a/rllib/examples/checkpoints/onnx_tf.py +++ b/rllib/examples/checkpoints/onnx_tf.py @@ -25,7 +25,7 @@ config = ( ppo.PPOConfig() # ONNX is not supported by RLModule API yet. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .env_runners(num_rollout_workers=1) .framework(args.framework) ) diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index 008be01378a7..963c665fc466 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -14,7 +14,7 @@ config = ( ppo.PPOConfig() # ONNX is not supported by RLModule API yet. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .env_runners(num_rollout_workers=1) .framework("torch") ) diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index 8b819941c98b..a59327c7e919 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -89,7 +89,7 @@ check(results1["hist_stats"], results2["hist_stats"]) # As well as training behavior (minibatch sequence during SGD # iterations). - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: check( results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py index 764df7e3721c..87fdbc4feaf6 100755 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ b/rllib/examples/envs/external_envs/cartpole_server.py @@ -186,7 +186,7 @@ def _input(ioctx): # Disable RLModules because they need connectors # TODO (Sven): Deprecate ExternalEnv (via EnvRunner path) and reenable connectors # and RL Modules here. - config.experimental(_enable_new_api_stack=False) + config.api_stack(enable_rl_module_and_learner=False) # DQN. if args.run == "DQN" or args.run == "APEX" or args.run == "R2D2": diff --git a/rllib/examples/inference/policy_inference_after_training_with_attention.py b/rllib/examples/inference/policy_inference_after_training_with_attention.py index fcc519429151..5151b9f3a82d 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_attention.py +++ b/rllib/examples/inference/policy_inference_after_training_with_attention.py @@ -78,7 +78,7 @@ get_trainable_cls(args.run) .get_default_config() # TODO (Kourosh): Enable when Attentions are supported. 
- .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("FrozenLake-v1") # Run with tracing enabled for tf2? .framework(args.framework) diff --git a/rllib/examples/inference/policy_inference_after_training_with_lstm.py b/rllib/examples/inference/policy_inference_after_training_with_lstm.py index 194aca4865e0..5117361a350c 100644 --- a/rllib/examples/inference/policy_inference_after_training_with_lstm.py +++ b/rllib/examples/inference/policy_inference_after_training_with_lstm.py @@ -130,7 +130,7 @@ # Set LSTM's initial internal state. lstm_cell_size = config["model"]["lstm_cell_size"] # range(2) b/c h- and c-states of the LSTM. - if algo.config._enable_new_api_stack: + if algo.config.enable_rl_module_and_learner: init_state = state = algo.get_policy().model.get_initial_state() else: init_state = state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)] diff --git a/rllib/examples/learners/ppo_load_rl_modules.py b/rllib/examples/learners/ppo_load_rl_modules.py index 6b87ba4d50a1..ef8ebf7684f7 100644 --- a/rllib/examples/learners/ppo_load_rl_modules.py +++ b/rllib/examples/learners/ppo_load_rl_modules.py @@ -58,7 +58,7 @@ def _parse_args(): # train a PPO algorithm with the loaded module config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework(args.framework) .rl_module(rl_module_spec=module_to_load_spec) .environment("CartPole-v1") diff --git a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py index 22830e42b39e..ddfa9a0a1164 100644 --- a/rllib/examples/learners/train_w_bc_finetune_w_ppo.py +++ b/rllib/examples/learners/train_w_bc_finetune_w_ppo.py @@ -114,7 +114,7 @@ def train_ppo_agent_from_checkpointed_module( """ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module(rl_module_spec=module_spec_from_ckpt) .environment(GYM_ENV_NAME) .training( diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index 8888cb33e708..031709f151f9 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -160,7 +160,7 @@ def _get_multi_agent(): get_trainable_cls(args.algo) .get_default_config() # Use new API stack ... 
- .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play @@ -255,7 +255,7 @@ def _get_multi_agent(): action = ask_user_for_action(time_step) else: obs = np.array(time_step.observations["info_state"][player_id]) - if config.uses_new_env_runners: + if config.enable_env_runner_and_connector_v2: action = algo.workers.local_worker().module.forward_inference( {"obs": obs} ) diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index 5ad574f19980..30aa8cf1fe41 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -106,7 +106,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( get_trainable_cls(args.algo) .get_default_config() - .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play diff --git a/rllib/examples/multi_agent/two_algorithms.py b/rllib/examples/multi_agent/two_algorithms.py index 43e75b4b414f..8cffdaf4173f 100644 --- a/rllib/examples/multi_agent/two_algorithms.py +++ b/rllib/examples/multi_agent/two_algorithms.py @@ -82,7 +82,7 @@ def select_policy(algorithm, framework): # Construct two independent Algorithm configs ppo_config = ( PPOConfig() - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment("multi_agent_cartpole") .framework(args.framework) # disable filters, otherwise we would need to synchronize those diff --git a/rllib/examples/ray_serve/ray_serve_with_rllib.py b/rllib/examples/ray_serve/ray_serve_with_rllib.py index be1432acadf1..3a73813eec37 100644 --- a/rllib/examples/ray_serve/ray_serve_with_rllib.py +++ b/rllib/examples/ray_serve/ray_serve_with_rllib.py @@ -68,7 +68,7 @@ def kill_proc(proc): # Config for the served RLlib RLModule/Algorithm. config = ( - PPOConfig().experimental(_enable_new_api_stack=True).environment("CartPole-v1") + PPOConfig().api_stack(enable_rl_module_and_learner=True).environment("CartPole-v1") ) # Train the Algorithm for some time, then save it and get the checkpoint path. 
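The open_spiel self-play hunks above switch inference between the new RLModule path and the old Policy path. Below is a hedged sketch of that switch as a self-contained helper; the `compute_single_action` fallback and the `policy_id` default are assumptions, only the RLModule call is taken from the hunk.

```python
import numpy as np


def pick_action(algo, config, info_state, policy_id="main"):
    """Sketch of the inference branch used in the open_spiel examples above."""
    obs = np.array(info_state)
    if config.enable_env_runner_and_connector_v2:
        # New stack: query the RLModule held by the local EnvRunner (as in the hunk).
        return algo.workers.local_worker().module.forward_inference({"obs": obs})
    # Old stack -- assumption: fall back to the Policy API.
    return algo.compute_single_action(obs, policy_id=policy_id)
```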
diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 6293d8f43a9a..97ec864bd504 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -58,7 +58,7 @@ def my_experiment(config: Dict): config = ( PPOConfig() .update_from_dict(config) - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") ) @@ -155,7 +155,7 @@ def my_experiment(config: Dict): if __name__ == "__main__": base_config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .env_runners( num_rollout_workers=0, diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index bbd3869e4477..a76a4d4981ab 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -83,7 +83,7 @@ def flush(self): if __name__ == "__main__": config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=SingleAgentEnvRunner) .environment("CartPole-v1") # Setting up a custom logger config. diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index 8a487fc9a436..9c8cdf365196 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -89,7 +89,7 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(env_runner_cls=MultiAgentEnvRunner) .environment("env") .multi_agent( diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index b3827fd010bc..1878ac75b675 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -56,7 +56,7 @@ def setup(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .rl_module( rl_module_spec=SingleAgentRLModuleSpec(module_class=MobileNetTorchPPORLModule) ) diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index cfe8cec79155..05d736945ed7 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -39,7 +39,7 @@ def tearDownClass(cls) -> None: def test_rlms_and_preprocessing(self): config = ( ppo.PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("tf2") .environment( env="ray.rllib.examples.envs.classes.random_env.RandomEnv", diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 94f716530354..1edce9f42dce 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -180,7 +180,7 @@ def compute_actions_from_input_dict( # Create a traced version of `self._compute_actions_helper`. if self._traced_compute_actions_helper is False and not self._no_tracing: - if self.config.get("_enable_new_api_stack"): + if self.config.get("enable_rl_module_and_learner"): self._compute_actions_helper_rl_module_explore = ( _convert_eager_inputs( tf.function( @@ -442,7 +442,7 @@ def __init__(self, observation_space, action_space, config): # action). 
self._lock = threading.RLock() - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -754,7 +754,7 @@ def get_state(self) -> PolicyState: if self._optimizer and len(self._optimizer.variables()) > 0: state["_optimizer_variables"] = self._optimizer.variables() # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index cc978c5ac677..25858d7eed91 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -112,7 +112,7 @@ def __init__( # If using default make_model(), dist_class will get updated when # the model is created next. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self.model = self.make_rl_module() self.dist_class = None else: @@ -121,7 +121,7 @@ def __init__( self._init_view_requirements() - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self.exploration = None else: self.exploration = self._create_exploration() @@ -158,10 +158,10 @@ def enable_eager_execution_if_necessary(): @override(Policy) def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): assert self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." - if self.config.get("_enable_new_api_stack", False) and self.model.is_stateful(): + if self.config.get("enable_rl_module_and_learner", False) and self.model.is_stateful(): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -213,7 +213,7 @@ def loss( Returns: A single loss tensor or a list of loss tensors. """ - # Under the new _enable_new_api_stack the loss function still gets called in + # Under the new enable_rl_module_and_learner the loss function still gets called in # order to initialize the view requirements of the sample batches that are # returned by the sampler. In this case, we don't actually want to compute any # loss, however @@ -221,7 +221,7 @@ def loss( # sampler will include those keys in the sample batches it returns. This means # that the correct sample batch keys will be available when using the learner # group API. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): for k in model.input_specs_train(): train_batch[k] return None @@ -442,7 +442,7 @@ def _init_dist_class(self): return dist_class def _init_view_requirements(self): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. 
self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -458,7 +458,7 @@ def _init_view_requirements(self): self.view_requirements[SampleBatch.INFOS].used_for_training = False def maybe_initialize_optimizer_and_loss(self): - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): optimizers = force_list(self.optimizer()) if self.exploration: # Policies with RLModules don't have an exploration object. @@ -509,7 +509,7 @@ def compute_actions_from_input_dict( timestep=timestep, explore=explore, tf_sess=self.get_session() ) - if self.config.get("_enable_new_api_stack"): + if self.config.get("enable_rl_module_and_learner"): # For recurrent models, we need to add a time dimension. seq_lens = input_dict.get("seq_lens", None) if seq_lens is None: @@ -627,7 +627,7 @@ def compute_log_likelihoods( action_dist = self.dist_class(dist_inputs, self.model) # Default log-likelihood calculation. else: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): if in_training: output = self.model.forward_train(input_batch) action_dist_cls = self.model.get_train_action_dist_cls() @@ -783,7 +783,7 @@ def get_state(self) -> PolicyState: state["global_timestep"] = state["global_timestep"].numpy() # In the new Learner API stack, the optimizers live in the learner. state["_optimizer_variables"] = [] - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): if self._optimizer and len(self._optimizer.variables()) > 0: state["_optimizer_variables"] = self._optimizer.variables() diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index c0e78b5a10c8..4b348771ec8b 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -401,7 +401,7 @@ def make_rl_module(self) -> "RLModule": """Returns the RL Module (only for when RLModule API is enabled.) If RLModule API is enabled - (self.config.experimental(_enable_new_api_stack=True), this method should be + (self.config.api_stack(enable_rl_module_and_learner=True), this method should be implemented and should return the RLModule instance to use for this Policy. Otherwise, RLlib will error out. """ @@ -521,7 +521,7 @@ def compute_single_action( if input_dict is None: input_dict = {SampleBatch.OBS: obs} if state is not None: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): input_dict["state_in"] = state else: for i, s in enumerate(state): @@ -1255,7 +1255,7 @@ def _get_num_gpus_for_policy(self) -> int: # If we are on the new RLModule/Learner stack, `num_gpus` is deprecated. # so use `num_gpus_per_worker` for policy sampling # we need this .get() syntax here to ensure backwards compatibility. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): num_gpus = self.config["num_gpus_per_worker"] else: # If head node, take num_gpus. @@ -1391,12 +1391,12 @@ def _initialize_loss_from_dummy_batch( self._lazy_tensor_dict(self._dummy_batch) # With RL Modules you want the explore flag to be True for initialization # of the tensors and placeholder you'd need for training. 
- explore = self.config.get("_enable_new_api_stack", False) + explore = self.config.get("enable_rl_module_and_learner", False) actions, state_outs, extra_outs = self.compute_actions_from_input_dict( self._dummy_batch, explore=explore ) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): for key, view_req in self.view_requirements.items(): if key not in self._dummy_batch.accessed_keys: view_req.used_for_compute_actions = False @@ -1446,7 +1446,7 @@ def _initialize_loss_from_dummy_batch( seq_lens = None if state_outs: B = 4 # For RNNs, have B=4, T=[depends on sample_batch_size] - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): sub_batch = postprocessed_batch[:B] postprocessed_batch["state_in"] = sub_batch["state_in"] postprocessed_batch["state_out"] = sub_batch["state_out"] @@ -1466,7 +1466,7 @@ def _initialize_loss_from_dummy_batch( seq_lens = np.array([seq_len for _ in range(B)], dtype=np.int32) postprocessed_batch[SampleBatch.SEQ_LENS] = seq_lens - if not self.config.get("_enable_new_api_stack"): + if not self.config.get("enable_rl_module_and_learner"): # Switch on lazy to-tensor conversion on `postprocessed_batch`. train_batch = self._lazy_tensor_dict(postprocessed_batch) # Calling loss, so set `is_training` to True. @@ -1506,7 +1506,7 @@ def _initialize_loss_from_dummy_batch( # Add new columns automatically to view-reqs. if ( - not self.config.get("_enable_new_api_stack") + not self.config.get("enable_rl_module_and_learner") and auto_remove_unneeded_view_reqs ): # Add those needed for postprocessing and training. @@ -1607,7 +1607,7 @@ def maybe_add_time_dimension( # We need to check for hasattr(self, "model") because a dummy Policy may not # have a model. if ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) and hasattr(self, "model") and self.model.is_stateful() ): diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 5277bc5c87b0..c13d0bbfd561 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -124,7 +124,7 @@ def do_test_log_likelihood( # The expected logp computation logic is overfitted to the ModelV2 # stack and does not generalize to RLModule API. - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: expected_logp = _get_expected_logp( fw, vars, obs_batch, a, layer_key, logp_func ) @@ -142,7 +142,7 @@ def do_test_log_likelihood( in_training=False, ) - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(np.exp(logp), expected_prob, atol=0.2) diff --git a/rllib/policy/tests/test_export_checkpoint_and_model.py b/rllib/policy/tests/test_export_checkpoint_and_model.py index 3a54e60d605e..32eaa654e00f 100644 --- a/rllib/policy/tests/test_export_checkpoint_and_model.py +++ b/rllib/policy/tests/test_export_checkpoint_and_model.py @@ -30,7 +30,7 @@ def export_test( cls = get_trainable_cls(alg_name) config = cls.get_default_config() if alg_name in RLMODULE_SUPPORTED_ALGOS: - config = config.experimental(_enable_new_api_stack=False) + config = config.api_stack(enable_rl_module_and_learner=False) config.framework(framework) # Switch on saving native DL-framework (tf, torch) model files. 
config.checkpointing(export_native_model_files=True) diff --git a/rllib/policy/tests/test_policy.py b/rllib/policy/tests/test_policy.py index dfd182dfe37f..6bd09c6e8ff3 100644 --- a/rllib/policy/tests/test_policy.py +++ b/rllib/policy/tests/test_policy.py @@ -34,7 +34,7 @@ def test_policy_get_and_set_state(self): # Make sure everything is the same. # This is only supported without RLModule API. See AlgorithmConfig for # more info. - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(state1["_exploration_state"], state3["_exploration_state"]) check(state1["global_timestep"], state3["global_timestep"]) check(state1["weights"], state3["weights"]) @@ -47,7 +47,7 @@ def test_policy_get_and_set_state(self): state4 = policy_restored_from_scratch.get_state() # This is only supported without RLModule API. See AlgorithmConfig for # more info. - if not config._enable_new_api_stack: + if not config.enable_rl_module_and_learner: check(state3["_exploration_state"], state4["_exploration_state"]) check(state3["global_timestep"], state4["global_timestep"]) # For tf static graph, the new model has different layer names diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index 9a1869a4b768..a7492e0e02e5 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -34,7 +34,7 @@ def __init__(self, lr, lr_schedule): self._lr_schedule = None # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. - if lr_schedule is None or self.config.get("_enable_new_api_stack", False): + if lr_schedule is None or self.config.get("enable_rl_module_and_learner", False): self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False) else: self._lr_schedule = PiecewiseSchedule( @@ -81,7 +81,7 @@ def __init__(self, entropy_coeff, entropy_coeff_schedule): # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. 
if entropy_coeff_schedule is None or ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) ): self.entropy_coeff = get_variable( entropy_coeff, framework="tf", tf_name="entropy_coeff", trainable=False @@ -214,7 +214,7 @@ class TargetNetworkMixin: """ def __init__(self): - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): model_vars = self.model.trainable_variables() target_model_vars = self.target_model.trainable_variables() @@ -244,7 +244,7 @@ def update_target_fn(tau): @property def q_func_vars(self): if not hasattr(self, "_q_func_vars"): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self._q_func_vars = self.model.variables else: self._q_func_vars = self.model.variables() @@ -253,7 +253,7 @@ def q_func_vars(self): @property def target_q_func_vars(self): if not hasattr(self, "_target_q_func_vars"): - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): self._target_q_func_vars = self.target_model.variables else: self._target_q_func_vars = self.target_model.variables() @@ -265,7 +265,7 @@ def update_target(self, tau: int = None) -> None: @override(TFPolicy) def variables(self) -> List[TensorType]: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): return self.model.variables else: return self.model.variables() @@ -277,7 +277,7 @@ def set_weights(self, weights): EagerTFPolicyV2.set_weights(self, weights) elif isinstance(self, EagerTFPolicy): # Handle TF2 policies. EagerTFPolicy.set_weights(self, weights) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): self.update_target(self.config.get("tau", 1.0)) diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index 7a11ec13a408..c2343f8c315c 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -35,7 +35,7 @@ def __init__(self, lr, lr_schedule, lr2=None, lr2_schedule=None): @override(Policy) def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): if self._lr_schedule: self.cur_lr = self._lr_schedule.value(global_vars["timestep"]) for opt in self._optimizers: @@ -58,7 +58,7 @@ def __init__(self, entropy_coeff, entropy_coeff_schedule): # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. if entropy_coeff_schedule is None or ( - self.config.get("_enable_new_api_stack", False) + self.config.get("enable_rl_module_and_learner", False) ): self.entropy_coeff = entropy_coeff else: @@ -210,7 +210,7 @@ def update_target(self, tau=None): # Support partial (soft) synching. # If tau == 1.0: Full sync from Q-model to target Q-model. 
- if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): target_current_network_pairs = self.model.get_target_network_pairs() for target_network, current_network in target_current_network_pairs: current_state_dict = current_network.state_dict() diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 31bce76119ab..8bb0b54685ef 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -739,7 +739,7 @@ def get_state(self) -> PolicyState: optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index c62d7e151965..08e7ec12b52c 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -86,7 +86,7 @@ def __init__( super().__init__(observation_space, action_space, config) # Create model. - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): model = self.make_rl_module() dist_class = None @@ -173,7 +173,7 @@ def __init__( self._state_inputs = self.model.get_initial_state() self._is_recurrent = len(tree.flatten(self._state_inputs)) > 0 - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # Maybe update view_requirements, e.g. for recurrent case. self.view_requirements = self.model.update_default_view_requirements( self.view_requirements @@ -184,13 +184,13 @@ def __init__( # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.view_requirements) - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # We don't need an exploration object with RLModules self.exploration = None else: self.exploration = self._create_exploration() - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): self._optimizers = force_list(self.optimizer()) # Backward compatibility workaround so Policy will call self.loss() @@ -250,7 +250,7 @@ def loss( Returns: Loss tensor given the input batch. """ - # Under the new _enable_new_api_stack the loss function still gets called in + # Under the new enable_rl_module_and_learner the loss function still gets called in # order to initialize the view requirements of the sample batches that are # returned by # the sampler. In this case, we don't actually want to compute any loss, however @@ -258,7 +258,7 @@ def loss( # sampler will include those keys in the sample batches it returns. This means # that the correct sample batch keys will be available when using the learner # group API. 
- if self.config._enable_new_api_stack: + if self.config.enable_rl_module_and_learner: for k in model.input_specs_train(): train_batch[k] return None @@ -327,10 +327,10 @@ def make_model(self) -> ModelV2: @override(Policy) def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): assert self.config.get( - "_enable_new_api_stack", False + "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." - if self.config.get("_enable_new_api_stack", False) and self.model.is_stateful(): + if self.config.get("enable_rl_module_and_learner", False) and self.model.is_stateful(): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -533,7 +533,7 @@ def compute_actions_from_input_dict( # Pass lazy (torch) tensor dict to Model as `input_dict`. input_dict = self._lazy_tensor_dict(input_dict) input_dict.set_training(True) - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): return self._compute_action_helper( input_dict, state_batches=None, @@ -647,7 +647,7 @@ def compute_log_likelihoods( action_dist = dist_class(dist_inputs, self.model) # Default action-dist inputs calculation. else: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): if in_training: output = self.model.forward_train(input_dict) action_dist_cls = self.model.get_train_action_dist_cls() @@ -754,9 +754,9 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) self._lazy_tensor_dict(batch) @@ -781,9 +781,9 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) @@ -883,7 +883,7 @@ def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0): { LEARNER_STATS_KEY: self.stats_fn(batch), "model": {} - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else model.metrics(), NUM_GRAD_UPDATES_LIFETIME: self.num_grad_updates, # -1, b/c we have to measure this diff before we do the update @@ -911,9 +911,9 @@ def compute_gradients(self, postprocessed_batch: SampleBatch) -> ModelGradients: shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("_enable_new_api_stack", False), + _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), padding="last" - if self.config.get("_enable_new_api_stack", False) + if self.config.get("enable_rl_module_and_learner", False) else "zero", ) @@ -992,7 +992,7 @@ def get_weights(self) -> ModelWeights: @override(Policy) def set_weights(self, weights: ModelWeights) -> None: weights = convert_to_torch_tensor(weights, device=self.device) - if self.config.get("_enable_new_api_stack", 
False): + if self.config.get("enable_rl_module_and_learner", False): self.model.set_state(weights) else: self.model.load_state_dict(weights) @@ -1007,7 +1007,7 @@ def num_state_tensors(self) -> int: @override(Policy) def get_initial_state(self) -> List[TensorType]: - if self.config.get("_enable_new_api_stack", False): + if self.config.get("enable_rl_module_and_learner", False): # convert the tree of tensors to a tree to numpy arrays return tree.map_structure( lambda s: convert_to_numpy(s), self.model.get_initial_state() @@ -1023,12 +1023,12 @@ def get_state(self) -> PolicyState: state["_optimizer_variables"] = [] # In the new Learner API stack, the optimizers live in the learner. - if not self.config.get("_enable_new_api_stack", False): + if not self.config.get("enable_rl_module_and_learner", False): for i, o in enumerate(self._optimizers): optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("_enable_new_api_stack", False) and self.exploration: + if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() @@ -1074,7 +1074,7 @@ def export_model(self, export_dir: str, onnx: Optional[int] = None) -> None: os.makedirs(export_dir, exist_ok=True) - enable_rl_module = self.config.get("_enable_new_api_stack", False) + enable_rl_module = self.config.get("enable_rl_module_and_learner", False) if enable_rl_module and onnx: raise ValueError("ONNX export not supported for RLModule API.") diff --git a/rllib/tests/test_algorithm_rl_module_restore.py b/rllib/tests/test_algorithm_rl_module_restore.py index 0f1c2f616210..f5028f8826b5 100644 --- a/rllib/tests/test_algorithm_rl_module_restore.py +++ b/rllib/tests/test_algorithm_rl_module_restore.py @@ -50,7 +50,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) .environment(MultiAgentCartPole, env_config={"num_agents": num_agents}) .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) @@ -89,7 +89,7 @@ def test_e2e_load_simple_marl_module(self): module_specs=module_specs, load_state_path=marl_checkpoint_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) @@ -155,7 +155,7 @@ def test_e2e_load_complex_marl_module(self): module_specs=module_specs, load_state_path=marl_checkpoint_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) @@ -188,7 +188,7 @@ def test_e2e_load_rl_module(self): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners(rollout_fragment_length=4) .environment("CartPole-v1") .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) @@ -221,7 +221,7 @@ def test_e2e_load_rl_module(self): load_state_path=module_ckpt_path, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( 
rl_module_spec=module_to_load_spec, ) @@ -300,7 +300,7 @@ def test_e2e_load_complex_marl_module_with_modules_to_load(self): "policy_0", }, ) - config = config.experimental(_enable_new_api_stack=True).rl_module( + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( rl_module_spec=marl_module_spec_from_checkpoint, ) diff --git a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py index 02467b60858d..1e32be167df0 100644 --- a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py @@ -32,7 +32,7 @@ def save_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): The learner stats after 2 iterations of training. """ algo_cfg = ( - algo_cfg.experimental(_enable_new_api_stack=True) + algo_cfg.api_stack(enable_rl_module_and_learner=True) .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at @@ -68,7 +68,7 @@ def load_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): """ algo_cfg = ( - algo_cfg.experimental(_enable_new_api_stack=True) + algo_cfg.api_stack(enable_rl_module_and_learner=True) .env_runners(num_env_runners=0) # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 # to make sure that we get results as soon as sampling/training is done at diff --git a/rllib/tests/test_rllib_train_and_evaluate.py b/rllib/tests/test_rllib_train_and_evaluate.py index 3bbe33a16a5a..899dc6d4493e 100644 --- a/rllib/tests/test_rllib_train_and_evaluate.py +++ b/rllib/tests/test_rllib_train_and_evaluate.py @@ -96,7 +96,7 @@ def learn_test_plus_evaluate(algo: str, env="CartPole-v1"): # call rllib train here to see if the RLModule API is enabled. algo_cls = get_trainable_cls(algo) config = algo_cls.get_default_config() - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: eval_ = ', \\"evaluation_config\\": {}' else: eval_ = ', \\"evaluation_config\\": {\\"explore\\": false}' diff --git a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml index a11ecb312fe4..99a3e024a23c 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml @@ -6,7 +6,7 @@ cartpole-appo-w-rl-modules-and-learner: timesteps_total: 200000 config: # Run with Learner- and RLModule API (new stack). - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml index 4d412735d1c8..fa440ff61b46 100644 --- a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml +++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml @@ -8,7 +8,7 @@ appo-pongnoframeskip-v5: timesteps_total: 20000000 config: # Run with Learner- and RLModule API (new stack). - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Make analogous to old v4 + NoFrameskip. 
env_config: frameskip: 1 diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo.py b/rllib/tuned_examples/appo/stateless_cartpole_appo.py index f66f23649649..4a1f4d288157 100644 --- a/rllib/tuned_examples/appo/stateless_cartpole_appo.py +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo.py @@ -5,7 +5,7 @@ config = ( APPOConfig() # TODO: Switch over to new stack once it supports LSTMs. - .experimental(_enable_new_api_stack=False) + .api_stack(enable_rl_module_and_learner=False) .environment(StatelessCartPole) .resources(num_gpus=0) .env_runners(num_env_runners=1, observation_filter="MeanStdFilter") diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index c8e26ab4763f..70934b6aee8f 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -297,7 +297,7 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 8ee3937c13cc..33b3d4f6afa7 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -290,7 +290,7 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, diff --git a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py index bd2ec7602f07..18b9b43517cf 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py @@ -5,7 +5,7 @@ DQNConfig() .environment(env="CartPole-v1") .framework(framework="torch") - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, diff --git a/rllib/tuned_examples/impala/cartpole-impala.yaml b/rllib/tuned_examples/impala/cartpole-impala.yaml index 5fc12c5ccd21..249a8d8db420 100644 --- a/rllib/tuned_examples/impala/cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala.yaml @@ -5,7 +5,7 @@ cartpole-impala: sampler_results/episode_reward_mean: 150 timesteps_total: 500000 config: - _enable_new_api_stack: true + enable_rl_module_and_learner: true # Works for both torch and tf. framework: tf2 num_gpus: 0 diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index f05a63c55319..5674437d9ead 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -80,7 +80,7 @@ def stop_all(self): PPOConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, # Following the paper. 
diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index a7a21431872b..7d70009dba5e 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -70,7 +70,7 @@ PPOConfig() .environment(env=env) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, diff --git a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py index c627fabc5a84..0042c05a3254 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py @@ -5,7 +5,7 @@ config = ( PPOConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, num_env_runners=1, diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index bd9db73545e7..2848e045b044 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -5,7 +5,7 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) # Switch off np.random, which is known to have memory leaks. .environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) .env_runners( diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py index a83f5aea1581..241cf8f79720 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py @@ -8,7 +8,7 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("multi_agent_pendulum") .env_runners( env_runner_cls=MultiAgentEnvRunner, diff --git a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py index 45f52388d325..2b909ab4dca6 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py @@ -5,7 +5,7 @@ config = ( PPOConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( env_runner_cls=SingleAgentEnvRunner, num_env_runners=2, diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index 7f33a20b3913..d6f052fa251d 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -68,7 +68,7 @@ def stop_all(self): SACConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. 
- .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 414b94833a5e..24f6ddca08ad 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -58,7 +58,7 @@ SACConfig() .environment(env=env) # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( rollout_fragment_length="auto", env_runner_cls=SingleAgentEnvRunner, diff --git a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py index b4da28c927df..adc788667b69 100644 --- a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py +++ b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py @@ -4,7 +4,7 @@ config = ( SACConfig() # Enable new API stack and use EnvRunner. - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .env_runners( rollout_fragment_length=1, env_runner_cls=SingleAgentEnvRunner, diff --git a/rllib/utils/checkpoints.py b/rllib/utils/checkpoints.py index 193742f39bef..1c5a989ce117 100644 --- a/rllib/utils/checkpoints.py +++ b/rllib/utils/checkpoints.py @@ -223,7 +223,7 @@ def convert_to_msgpack_checkpoint( state["worker"]["is_policy_to_train"] = NOT_SERIALIZABLE # Add RLlib checkpoint version (as string). - if state["config"]["_enable_new_api_stack"]: + if state["config"]["enable_rl_module_and_learner"]: state["checkpoint_version"] = str(CHECKPOINT_VERSION_LEARNER) else: state["checkpoint_version"] = str(CHECKPOINT_VERSION) diff --git a/rllib/utils/debug/memory.py b/rllib/utils/debug/memory.py index ab4641521baf..ef7c6c68766a 100644 --- a/rllib/utils/debug/memory.py +++ b/rllib/utils/debug/memory.py @@ -112,7 +112,7 @@ def code(): results_per_category["policy"].extend(test) # Testing this only makes sense if the learner API is disabled. - if not policy.config.get("_enable_new_api_stack", False): + if not policy.config.get("enable_rl_module_and_learner", False): # Call `learn_on_batch()` n times. dummy_batch = policy._get_dummy_batch_from_view_requirements(batch_size=16) @@ -172,7 +172,7 @@ def code(): if test: results_per_category["rollout_worker"].extend(test) - if "learner" in to_check and algorithm.config.get("_enable_new_api_stack", False): + if "learner" in to_check and algorithm.config.get("enable_rl_module_and_learner", False): learner_group = algorithm.learner_group assert learner_group._is_local, ( "This test will miss leaks hidden in remote " diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index 8d15b74c751a..b0cb4a4ff915 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -22,7 +22,7 @@ def do_test_explorations(config, dummy_obs, prev_a=None, expected_mean_action=No for exploration in [None, "Random"]: local_config = config.copy() if exploration == "Random": - if local_config._enable_new_api_stack: + if local_config.enable_rl_module_and_learner: # TODO(Artur): Support Random exploration with RL Modules. 
continue local_config.env_runners(exploration_config={"type": "Random"}) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 661c44edebc9..e1cb759852ad 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -97,7 +97,7 @@ def add_rllib_example_script_args( parser.add_argument( "--enable-new-api-stack", action="store_true", - help="Whether to use the _enable_new_api_stack config setting.", + help="Whether to use the `enable_rl_module_and_learner` config setting.", ) parser.add_argument( "--framework", @@ -419,7 +419,7 @@ def _test( input_dict[SampleBatch.PREV_ACTIONS] = action_in input_dict[SampleBatch.PREV_REWARDS] = reward_in if state_in: - if what.config.get("_enable_new_api_stack", False): + if what.config.get("enable_rl_module_and_learner", False): input_dict["state_in"] = state_in else: for i, s in enumerate(state_in): @@ -813,7 +813,7 @@ def framework_iterator( for fw in frameworks: # Skip tf if on new API stack. - if fw == "tf" and config.get("_enable_new_api_stack", False): + if fw == "tf" and config.get("enable_rl_module_and_learner", False): logger.warning("Skipping `framework=tf` (new API stack configured)!") continue # Skip if tf/tf2 and py >= 3.11. @@ -1280,7 +1280,7 @@ def run_rllib_example_script_experiment( # Set the framework. base_config.framework(args.framework) # Enable the new API stack? - .experimental(_enable_new_api_stack=args.enable_new_api_stack) + .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) # Define EnvRunner/RolloutWorker scaling and behavior. .env_runners( num_env_runners=args.num_env_runners, @@ -1558,7 +1558,7 @@ def check_reproducibilty( # iterations). # As well as training behavior (minibatch sequence during SGD # iterations). - if algo_config._enable_new_api_stack: + if algo_config.enable_rl_module_and_learner: check( results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], @@ -1921,7 +1921,7 @@ def _do_check(alg, config, a_name, o_name): config_copy = config.copy() config_copy.validate() # If RLModules are enabled, we need to skip a few tests for now: - if config_copy._enable_new_api_stack: + if config_copy.enable_rl_module_and_learner: # Skip PPO cases in which RLModules don't support the given spaces yet. if o_name not in rlmodule_supported_observation_spaces: logger.warning( @@ -2000,7 +2000,7 @@ def _do_check(alg, config, a_name, o_name): if not frameworks: frameworks = ("tf2", "tf", "torch") - if config._enable_new_api_stack: + if config.enable_rl_module_and_learner: # Only test the frameworks that are supported by RLModules. 
frameworks = tuple( fw for fw in frameworks if fw in rlmodule_supported_frameworks From 1fa79e836b72afe8f0f92fc3e69657c536317594 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 25 Apr 2024 23:04:43 +0200 Subject: [PATCH 04/15] LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 27 ++++++++++--------- rllib/algorithms/dqn/dqn.py | 3 ++- rllib/algorithms/impala/impala.py | 7 ++--- rllib/algorithms/ppo/ppo.py | 8 ++++-- rllib/algorithms/ppo/tests/test_ppo.py | 3 ++- rllib/algorithms/sac/sac.py | 4 ++- rllib/algorithms/tests/test_algorithm.py | 4 ++- .../algorithms/tests/test_algorithm_config.py | 5 ++-- rllib/connectors/agent/state_buffer.py | 4 ++- rllib/evaluate.py | 4 ++- rllib/evaluation/episode_v2.py | 4 ++- rllib/evaluation/rollout_worker.py | 10 +++---- rllib/evaluation/tests/test_rollout_worker.py | 4 +-- .../evaluation_parallel_to_training.py | 3 ++- .../ray_serve/ray_serve_with_rllib.py | 4 ++- rllib/policy/eager_tf_policy.py | 5 +++- rllib/policy/eager_tf_policy_v2.py | 9 ++++--- .../tests/test_policy_checkpoint_restore.py | 8 ++++-- rllib/policy/tf_mixins.py | 4 ++- rllib/policy/torch_policy.py | 5 +++- rllib/policy/torch_policy_v2.py | 26 +++++++++++++----- rllib/utils/debug/memory.py | 4 ++- 22 files changed, 102 insertions(+), 53 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 9f89297640d8..ef043775504f 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -33,7 +33,6 @@ from ray.rllib.env.wrappers.atari_wrappers import is_atari from ray.rllib.evaluation.collectors.sample_collector import SampleCollector from ray.rllib.evaluation.collectors.simple_list_collector import SimpleListCollector -from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.models import MODEL_DEFAULTS from ray.rllib.policy.policy import Policy, PolicySpec from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID @@ -614,7 +613,9 @@ def to_dict(self) -> AlgorithmConfigDict: config["custom_eval_function"] = config.pop("custom_evaluation_function", None) config["framework"] = config.pop("framework_str", None) config["num_cpus_for_driver"] = config.pop("num_cpus_for_local_worker", 1) - config["num_workers"] = config.pop("num_env_runners", config.pop("num_rollout_workers", 0)) + config["num_workers"] = config.pop( + "num_env_runners", config.pop("num_rollout_workers", 0) + ) # Simplify: Remove all deprecated keys that have as value `DEPRECATED_VALUE`. # These would be useless in the returned dict anyways. @@ -894,8 +895,8 @@ def build_env_to_module_connector(self, env): # Unsupported return value. else: raise ValueError( - "`AlgorithmConfig.env_runners(env_to_module_connector=..)` must return" - " a ConnectorV2 object or a list thereof (to be added to a " + "`AlgorithmConfig.env_runners(env_to_module_connector=..)` must " + "return a ConnectorV2 object or a list thereof (to be added to a " f"pipeline)! Your function returned {val_}." ) @@ -960,8 +961,8 @@ def build_module_to_env_connector(self, env): # Unsupported return value. else: raise ValueError( - "`AlgorithmConfig.env_runners(module_to_env_connector=..)` must return" - " a ConnectorV2 object or a list thereof (to be added to a " + "`AlgorithmConfig.env_runners(module_to_env_connector=..)` must " + "return a ConnectorV2 object or a list thereof (to be added to a " f"pipeline)! Your function returned {val_}." 
) @@ -1409,7 +1410,6 @@ def framework( return self - def api_stack( self, enable_rl_module_and_learner: Optional[str] = NotProvided, @@ -1580,7 +1580,6 @@ def env_runners( rollout_fragment_length: Optional[Union[int, str]] = NotProvided, batch_mode: Optional[str] = NotProvided, explore: Optional[bool] = NotProvided, - # @OldAPIStack settings. exploration_config: Optional[dict] = NotProvided, # @OldAPIStack create_env_on_local_worker: Optional[bool] = NotProvided, # @OldAPIStack @@ -1592,7 +1591,6 @@ def env_runners( observation_filter: Optional[str] = NotProvided, # @OldAPIStack enable_tf1_exec_eagerly: Optional[bool] = NotProvided, # @OldAPIStack sampler_perf_stats_ema_coef: Optional[float] = NotProvided, # @OldAPIStack - # Deprecated args. num_rollout_workers=DEPRECATED_VALUE, num_envs_per_worker=DEPRECATED_VALUE, @@ -3133,7 +3131,8 @@ def get_evaluation_config_object( if self.evaluation_duration == "auto" else int( math.ceil( - self.evaluation_duration / (self.evaluation_num_env_runners or 1) + self.evaluation_duration + / (self.evaluation_num_env_runners or 1) ) ) ) @@ -3981,7 +3980,10 @@ def _validate_evaluation_settings(self): # `evaluation_parallel_to_training=True`, warn that you need # at least one remote eval worker for parallel training and # evaluation, and set `evaluation_parallel_to_training` to False. - if self.evaluation_num_env_runners == 0 and self.evaluation_parallel_to_training: + if ( + self.evaluation_num_env_runners == 0 + and self.evaluation_parallel_to_training + ): raise ValueError( "`evaluation_parallel_to_training` can only be done if " "`evaluation_num_env_runners` > 0! Try setting " @@ -4071,7 +4073,8 @@ def _validate_new_api_stack_settings(self): "API stack! Try setting " "`config.api_stack(enable_rl_module_and_learner=True)`." ) - # Early out. The rest of this method is only for enable_rl_module_and_learner=True. + # Early out. The rest of this method is only for + # `enable_rl_module_and_learner=True`. return # New API stack (RLModule, Learner APIs) only works with connectors. diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index 0f4806099c37..7a869d950aa5 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -432,7 +432,8 @@ def validate(self) -> None: raise ValueError( f"Your `rollout_fragment_length` ({self.rollout_fragment_length}) is " f"smaller than `n_step` ({self.n_step})! " - f"Try setting config.env_runners(rollout_fragment_length={self.n_step})." + "Try setting config.env_runners(rollout_fragment_length=" + f"{self.n_step})." ) # TODO (simon): Find a clean solution to deal with diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index df62c12f5346..c4ee2bfb60ec 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -981,8 +981,8 @@ def learn_on_processed_samples(self) -> ResultDict: def place_processed_samples_on_learner_thread_queue(self) -> None: """Place processed samples on the learner queue for training. - NOTE: This method is called if self.config.enable_rl_module_and_learner is False. - + NOTE: This method is called if self.config.enable_rl_module_and_learner is + False. """ for i, batch in enumerate(self.batches_to_place_on_learner): try: @@ -1008,7 +1008,8 @@ def place_processed_samples_on_learner_thread_queue(self) -> None: def process_trained_results(self) -> ResultDict: """Process training results that are outputed by the learner thread. - NOTE: This method is called if self.config.enable_rl_module_and_learner is False. 
+ NOTE: This method is called if self.config.enable_rl_module_and_learner is + False. Returns: Aggregated results from the learner thread after an update is completed. diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 857765937c8f..d30909dcfce1 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -434,7 +434,9 @@ def _training_step_new_api_stack(self) -> ResultDict: worker_set=self.workers, max_agent_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.enable_env_runner_and_connector_v2, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) else: @@ -442,7 +444,9 @@ def _training_step_new_api_stack(self) -> ResultDict: worker_set=self.workers, max_env_steps=self.config.total_train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.enable_env_runner_and_connector_v2, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) # Return early if all our workers failed. diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 914cc9b34bff..57988d3df6e8 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -272,7 +272,8 @@ def test_ppo_exploration_setup(self): .environment( "FrozenLake-v1", env_config={"is_slippery": False, "map_name": "4x4"}, - ).env_runners( + ) + .env_runners( # Run locally. num_env_runners=0, ) diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 1344652774f6..32489cdeb4b6 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -347,7 +347,9 @@ def validate(self) -> None: # Validate that we use the corresponding `EpisodeReplayBuffer` when using # episodes. # TODO (sven, simon): Implement the multi-agent case for replay buffers. - if self.enable_env_runner_and_connector_v2 and self.replay_buffer_config["type"] not in [ + if self.enable_env_runner_and_connector_v2 and self.replay_buffer_config[ + "type" + ] not in [ "EpisodeReplayBuffer", "PrioritizedEpisodeReplayBuffer", ]: diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index 865e63eaa37d..7da5ec811752 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -342,7 +342,9 @@ def test_space_inference_from_remote_workers(self): config = ( ppo.PPOConfig() - .env_runners(num_env_runners=1, validate_env_runners_after_construction=False) + .env_runners( + num_env_runners=1, validate_env_runners_after_construction=False + ) .environment(env="CartPole-v1") ) diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index 2ed25d5f0ed2..aed916921967 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -474,9 +474,8 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, but the MultiAgentRLModuleSpec has not defined its # SingleAgentRLmoduleSpecs. 
- config = ( - MultiAgentAlgoConfigWithNoSingleAgentSpec() - .api_stack(enable_rl_module_and_learner=True) + config = MultiAgentAlgoConfigWithNoSingleAgentSpec().api_stack( + enable_rl_module_and_learner=True ) self.assertRaisesRegex( diff --git a/rllib/connectors/agent/state_buffer.py b/rllib/connectors/agent/state_buffer.py index 97361f790373..aa3f8e94fb06 100644 --- a/rllib/connectors/agent/state_buffer.py +++ b/rllib/connectors/agent/state_buffer.py @@ -33,7 +33,9 @@ def __init__(self, ctx: ConnectorContext, states: Any = None): self._action_space_struct = get_base_struct_from_space(ctx.action_space) self._states = defaultdict(lambda: defaultdict(lambda: (None, None, None))) - self._enable_new_api_stack = ctx.config.get("enable_rl_module_and_learner", False) + self._enable_new_api_stack = ctx.config.get( + "enable_rl_module_and_learner", False + ) # TODO(jungong) : we would not need this if policies are never stashed # during the rollout of a single episode. if states: diff --git a/rllib/evaluate.py b/rllib/evaluate.py index 9076d0594da6..05f1fa7d4862 100755 --- a/rllib/evaluate.py +++ b/rllib/evaluate.py @@ -234,7 +234,9 @@ def run( env = config.get("env") # Make sure we have evaluation workers. - if not config.get("evaluation_num_workers", config.get("evaluation_num_env_runners")): + if not config.get( + "evaluation_num_workers", config.get("evaluation_num_env_runners") + ): config["evaluation_num_env_runners"] = config.get("num_workers", 0) if not config.get("evaluation_duration"): config["evaluation_duration"] = 1 diff --git a/rllib/evaluation/episode_v2.py b/rllib/evaluation/episode_v2.py index 03338bcc28ca..b4d15f94548c 100644 --- a/rllib/evaluation/episode_v2.py +++ b/rllib/evaluation/episode_v2.py @@ -191,7 +191,9 @@ def add_init_obs( ), is_policy_recurrent=policy.is_recurrent(), intial_states=policy.get_initial_state(), - _enable_new_api_stack=policy.config.get("enable_rl_module_and_learner", False), + _enable_new_api_stack=policy.config.get( + "enable_rl_module_and_learner", False + ), ) self._agent_collectors[agent_id].add_init_obs( episode_id=self.episode_id, diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 9177b6d3cae3..cad7795680bb 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -134,9 +134,9 @@ def _update_env_seed_if_necessary( # rollout workers. max_num_envs_per_env_runner: int = 1000 assert ( - worker_idx < max_num_envs_per_env_runners + worker_idx < max_num_envs_per_env_runner ), "Too many envs per worker. Random seeds may collide." - computed_seed: int = worker_idx * max_num_envs_per_env_runners + vector_idx + seed + computed_seed: int = worker_idx * max_num_envs_per_env_runner + vector_idx + seed # Gymnasium.env. # This will silently fail for most Farama-foundation gymnasium environments. @@ -1860,9 +1860,9 @@ def _build_policy_map( new_policy = policy # Maybe torch compile an RLModule. 
- if self.config.get("enable_rl_module_and_learner", False) and self.config.get( - "torch_compile_worker" - ): + if self.config.get( + "enable_rl_module_and_learner", False + ) and self.config.get("torch_compile_worker"): if self.config.framework_str != "torch": raise ValueError("Attempting to compile a non-torch RLModule.") rl_module = getattr(new_policy, "model", None) diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 340e9b279cc7..254e515e3133 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -866,9 +866,7 @@ def test_no_env_seed(self): ev = RolloutWorker( env_creator=lambda _: MockVectorEnv(20, mocked_num_envs=8), default_policy_class=MockPolicy, - config=AlgorithmConfig() - .env_runners(num_env_runners=0) - .debugging(seed=1), + config=AlgorithmConfig().env_runners(num_env_runners=0).debugging(seed=1), ) assert not hasattr(ev.env, "seed") ev.stop() diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index fc557cd52dbb..d5d035282a03 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -152,7 +152,8 @@ def on_train_result(self, *, algorithm: Algorithm, result: ResultDict, **kwargs) # fetch. assert ( num_timesteps_reported == 0 - or num_timesteps_reported >= algorithm.config.evaluation_num_env_runners + or num_timesteps_reported + >= algorithm.config.evaluation_num_env_runners ) # We count in episodes. elif algorithm.config.evaluation_duration_unit == "episodes": diff --git a/rllib/examples/ray_serve/ray_serve_with_rllib.py b/rllib/examples/ray_serve/ray_serve_with_rllib.py index 3a73813eec37..6001865a5544 100644 --- a/rllib/examples/ray_serve/ray_serve_with_rllib.py +++ b/rllib/examples/ray_serve/ray_serve_with_rllib.py @@ -68,7 +68,9 @@ def kill_proc(proc): # Config for the served RLlib RLModule/Algorithm. config = ( - PPOConfig().api_stack(enable_rl_module_and_learner=True).environment("CartPole-v1") + PPOConfig() + .api_stack(enable_rl_module_and_learner=True) + .environment("CartPole-v1") ) # Train the Algorithm for some time, then save it and get the checkpoint path. diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 1edce9f42dce..e3d7f93bf67c 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -754,7 +754,10 @@ def get_state(self) -> PolicyState: if self._optimizer and len(self._optimizer.variables()) > 0: state["_optimizer_variables"] = self._optimizer.variables() # Add exploration state. - if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index 25858d7eed91..1a14520f1d11 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -161,7 +161,10 @@ def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." 
- if self.config.get("enable_rl_module_and_learner", False) and self.model.is_stateful(): + if ( + self.config.get("enable_rl_module_and_learner", False) + and self.model.is_stateful() + ): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -213,8 +216,8 @@ def loss( Returns: A single loss tensor or a list of loss tensors. """ - # Under the new enable_rl_module_and_learner the loss function still gets called in - # order to initialize the view requirements of the sample batches that are + # Under the new enable_rl_module_and_learner the loss function still gets called + # in order to initialize the view requirements of the sample batches that are # returned by the sampler. In this case, we don't actually want to compute any # loss, however # if we access the keys that are needed for a forward_train pass, then the diff --git a/rllib/policy/tests/test_policy_checkpoint_restore.py b/rllib/policy/tests/test_policy_checkpoint_restore.py index dc9f22e3c3ba..cc7598dc7710 100644 --- a/rllib/policy/tests/test_policy_checkpoint_restore.py +++ b/rllib/policy/tests/test_policy_checkpoint_restore.py @@ -16,7 +16,9 @@ def _do_checkpoint_twice_test(framework): # Checks if we can load a policy from a checkpoint (at least) twice config = ( - PPOConfig().env_runners(num_env_runners=0).evaluation(evaluation_num_env_runners=0) + PPOConfig() + .env_runners(num_env_runners=0) + .evaluation(evaluation_num_env_runners=0) ) for fw in framework_iterator(config, frameworks=[framework]): algo1 = config.build(env="CartPole-v1") @@ -60,7 +62,9 @@ def test_policy_from_checkpoint_twice_torch(self): def test_add_policy_connector_enabled(self): with tempfile.TemporaryDirectory() as tmpdir: config = ( - APPOConfig().environment("CartPole-v1").env_runners(enable_connectors=True) + APPOConfig() + .environment("CartPole-v1") + .env_runners(enable_connectors=True) ) algo = config.build() algo.train() diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index a7492e0e02e5..e4e88aa00785 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -34,7 +34,9 @@ def __init__(self, lr, lr_schedule): self._lr_schedule = None # Disable any scheduling behavior related to learning if Learner API is active. # Schedules are handled by Learner class. - if lr_schedule is None or self.config.get("enable_rl_module_and_learner", False): + if lr_schedule is None or self.config.get( + "enable_rl_module_and_learner", False + ): self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False) else: self._lr_schedule = PiecewiseSchedule( diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index 8bb0b54685ef..6d53b78da360 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -739,7 +739,10 @@ def get_state(self) -> PolicyState: optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. 
state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 08e7ec12b52c..5a52cdfd32bc 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -250,8 +250,8 @@ def loss( Returns: Loss tensor given the input batch. """ - # Under the new enable_rl_module_and_learner the loss function still gets called in - # order to initialize the view requirements of the sample batches that are + # Under the new enable_rl_module_and_learner the loss function still gets called + # in order to initialize the view requirements of the sample batches that are # returned by # the sampler. In this case, we don't actually want to compute any loss, however # if we access the keys that are needed for a forward_train pass, then the @@ -330,7 +330,10 @@ def maybe_remove_time_dimension(self, input_dict: Dict[str, TensorType]): "enable_rl_module_and_learner", False ), "This is a helper method for the new learner API." - if self.config.get("enable_rl_module_and_learner", False) and self.model.is_stateful(): + if ( + self.config.get("enable_rl_module_and_learner", False) + and self.model.is_stateful() + ): # Note that this is a temporary workaround to fit the old sampling stack # to RL Modules. ret = {} @@ -754,7 +757,9 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" if self.config.get("enable_rl_module_and_learner", False) else "zero", @@ -781,7 +786,9 @@ def load_batch_into_buffer( shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" if self.config.get("enable_rl_module_and_learner", False) else "zero", @@ -911,7 +918,9 @@ def compute_gradients(self, postprocessed_batch: SampleBatch) -> ModelGradients: shuffle=False, batch_divisibility_req=self.batch_divisibility_req, view_requirements=self.view_requirements, - _enable_new_api_stack=self.config.get("enable_rl_module_and_learner", False), + _enable_new_api_stack=self.config.get( + "enable_rl_module_and_learner", False + ), padding="last" if self.config.get("enable_rl_module_and_learner", False) else "zero", @@ -1028,7 +1037,10 @@ def get_state(self) -> PolicyState: optim_state_dict = convert_to_numpy(o.state_dict()) state["_optimizer_variables"].append(optim_state_dict) # Add exploration state. - if not self.config.get("enable_rl_module_and_learner", False) and self.exploration: + if ( + not self.config.get("enable_rl_module_and_learner", False) + and self.exploration + ): # This is not compatible with RLModules, which have a method # `forward_exploration` to specify custom exploration behavior. 
state["_exploration_state"] = self.exploration.get_state() diff --git a/rllib/utils/debug/memory.py b/rllib/utils/debug/memory.py index ef7c6c68766a..5f7944c08177 100644 --- a/rllib/utils/debug/memory.py +++ b/rllib/utils/debug/memory.py @@ -172,7 +172,9 @@ def code(): if test: results_per_category["rollout_worker"].extend(test) - if "learner" in to_check and algorithm.config.get("enable_rl_module_and_learner", False): + if "learner" in to_check and algorithm.config.get( + "enable_rl_module_and_learner", False + ): learner_group = algorithm.learner_group assert learner_group._is_local, ( "This test will miss leaks hidden in remote " From 2033dbd4880bb8b3243c4f20b854365347626b25 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 25 Apr 2024 23:12:32 +0200 Subject: [PATCH 05/15] docs Signed-off-by: sven1977 --- doc/source/rllib/doc_code/catalog_guide.py | 2 +- doc/source/rllib/doc_code/new_api_stack.py | 61 +++++++++------------ doc/source/rllib/doc_code/rlmodule_guide.py | 8 +-- doc/source/rllib/rllib-learner.rst | 4 +- doc/source/rllib/rllib-rlmodule.rst | 2 +- rllib/evaluation/worker_set.py | 14 ++++- 6 files changed, 46 insertions(+), 45 deletions(-) diff --git a/doc/source/rllib/doc_code/catalog_guide.py b/doc/source/rllib/doc_code/catalog_guide.py index 1a92fccb83e1..6a9a5ef1f083 100644 --- a/doc/source/rllib/doc_code/catalog_guide.py +++ b/doc/source/rllib/doc_code/catalog_guide.py @@ -113,7 +113,7 @@ def __init__(self, *args, **kwargs): config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .framework("torch") ) diff --git a/doc/source/rllib/doc_code/new_api_stack.py b/doc/source/rllib/doc_code/new_api_stack.py index 597922bb48df..fbe485c5b665 100644 --- a/doc/source/rllib/doc_code/new_api_stack.py +++ b/doc/source/rllib/doc_code/new_api_stack.py @@ -1,22 +1,19 @@ # __enabling-new-api-stack-sa-ppo-begin__ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig().environment("CartPole-v1") - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and - # `MultiAgentEnvRunner` for multi-agent cases. - .env_runners(env_runner_cls=SingleAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). + # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. 
However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -43,7 +40,6 @@ # __enabling-new-api-stack-ma-ppo-begin__ from ray.rllib.algorithms.ppo import PPOConfig # noqa -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner # noqa from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole # noqa @@ -51,17 +47,15 @@ # looks like this. config = ( PPOConfig().environment(MultiAgentCartPole, env_config={"num_agents": 2}) - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - # Set the `env_runner_cls` to `SingleAgentEnvRunner` for single-agent setups and - # `MultiAgentEnvRunner` for multi-agent cases. - .env_runners(env_runner_cls=MultiAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). + # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and @@ -95,20 +89,19 @@ # __enabling-new-api-stack-sa-sac-begin__ from ray.rllib.algorithms.sac import SACConfig # noqa -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner # noqa config = ( SACConfig().environment("Pendulum-v1") - # Switch the new API stack flag to True (False by default). - # This enables the use of the RLModule (replaces ModelV2) AND Learner (replaces - # Policy) classes. - .experimental(_enable_new_api_stack=True) - # However, the above flag only activates the RLModule and Learner APIs. In order - # to utilize all of the new API stack's classes, you also have to specify the - # EnvRunner (replaces RolloutWorker) to use. - # Note that this step will be fully automated in the next release. - .env_runners(env_runner_cls=SingleAgentEnvRunner) + # Switch both the new API stack flags to True (both False by default). + # This enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy) + # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) + # and enables ConnectorV2 support. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # We are using a simple 1-CPU setup here for learning. 
However, as the new stack # supports arbitrary scaling on the learner axis, feel free to set # `num_learner_workers` to the number of available GPUs for multi-GPU training (and diff --git a/doc/source/rllib/doc_code/rlmodule_guide.py b/doc/source/rllib/doc_code/rlmodule_guide.py index f00388818413..4a2342b3b10e 100644 --- a/doc/source/rllib/doc_code/rlmodule_guide.py +++ b/doc/source/rllib/doc_code/rlmodule_guide.py @@ -12,7 +12,7 @@ config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .framework("torch") .environment("CartPole-v1") ) @@ -80,7 +80,7 @@ config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment("CartPole-v1") .rl_module( model_config_dict={"fcnet_hiddens": [32, 32]}, @@ -103,7 +103,7 @@ config = ( BCConfigTest() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .environment(MultiAgentCartPole, env_config={"num_agents": 2}) .rl_module( model_config_dict={"fcnet_hiddens": [32, 32]}, @@ -406,7 +406,7 @@ def setup(self): config = ( PPOConfig() # Enable the new API stack (RLModule and Learner APIs). - .experimental(_enable_new_api_stack=True).environment("CartPole-v1") + .api_stack(enable_rl_module_and_learner=True).environment("CartPole-v1") ) env = gym.make("CartPole-v1") # Create an RL Module that we would like to checkpoint diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 8900783235f1..d9c4e846b226 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -58,7 +58,7 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf config = ( PPOConfig() - .experimental(_enable_new_api_stack=True) + .api_stack(enable_rl_module_and_learner=True) .resources( num_gpus_per_learner_worker=0, # Set this to 1 to enable GPU training. num_cpus_per_learner_worker=1, @@ -77,7 +77,7 @@ arguments in the :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConf .. note:: This features is in alpha. If you migrate to this algorithm, enable the feature by - via `AlgorithmConfig.experimental(_enable_new_api_stack=True)`. + via `AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)`. The following algorithms support :py:class:`~ray.rllib.core.learner.learner.Learner` out of the box. Implement an algorithm with a custom :py:class:`~ray.rllib.core.learner.learner.Learner` to leverage this API for other algorithms. diff --git a/doc/source/rllib/rllib-rlmodule.rst b/doc/source/rllib/rllib-rlmodule.rst index ece872bc6188..ff6d16a3626d 100644 --- a/doc/source/rllib/rllib-rlmodule.rst +++ b/doc/source/rllib/rllib-rlmodule.rst @@ -64,7 +64,7 @@ RL Module is a neural network container that implements three public methods: :p Enabling RL Modules in the Configuration ---------------------------------------- -Enable RL Modules via our configuration object: ``AlgorithmConfig.experimental(_enable_new_api_stack=True)``. +Enable RL Modules via our configuration object: ``AlgorithmConfig.api_stack(enable_rl_module_and_learner=True)``. .. 
literalinclude:: doc_code/rlmodule_guide.py :language: python diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index e07d67c05d14..f627b5ace3b0 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -26,6 +26,8 @@ from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.env_context import EnvContext from ray.rllib.env.env_runner import EnvRunner +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.offline import get_dataset_and_shards from ray.rllib.policy.policy import Policy, PolicyState from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID @@ -138,9 +140,15 @@ def __init__( } # Set the EnvRunner subclass to be used as "workers". Default: RolloutWorker. - self.env_runner_cls = ( - RolloutWorker if config.env_runner_cls is None else config.env_runner_cls - ) + self.env_runner_cls = config.env_runner_cls + if self.env_runner_cls is None: + if config.enable_env_runner_and_connector_v2: + if config.is_multi_agent(): + self.env_runner_cls = MultiAgentEnvRunner + else: + self.env_runner_cls = SingleAgentEnvRunner + else: + self.env_runner_cls = RolloutWorker self._cls = ray.remote(**self._remote_args)(self.env_runner_cls).remote self._logdir = logdir From 3642334c4b952903dd5e6d32044cbacb098f860a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 10:28:36 +0200 Subject: [PATCH 06/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 10 +-- .../ppo/tests/test_ppo_with_env_runner.py | 9 ++- .../tests/test_callbacks_on_env_runner.py | 21 +++--- .../algorithms/tests/test_worker_failures.py | 69 ++++++++++++++----- .../env/tests/test_multi_agent_env_runner.py | 6 +- .../evaluation/tests/test_envs_that_crash.py | 1 - .../self_play_league_based_with_open_spiel.py | 15 ++-- .../multi_agent/self_play_with_open_spiel.py | 11 ++- rllib/examples/ray_tune/custom_experiment.py | 11 ++- rllib/examples/ray_tune/custom_logger.py | 7 +- .../ray_tune/custom_progress_reporter.py | 7 +- .../tuned_examples/dqn/benchmark_dqn_atari.py | 7 +- ...benchmark_dqn_atari_rllib_preprocessing.py | 7 +- .../dqn/cartpole_dqn_envrunner.py | 9 ++- .../ppo/benchmark_ppo_mujoco.py | 9 +-- .../ppo/benchmark_ppo_mujoco_pb2.py | 9 +-- .../ppo/cartpole_ppo_envrunner.py | 9 ++- .../ppo/memory_leak_test_ppo_new_stack.py | 7 +- .../ppo/multi_agent_pendulum_ppo_envrunner.py | 7 +- .../ppo/pendulum_ppo_envrunner.py | 7 +- .../sac/benchmark_sac_mujoco.py | 9 +-- .../sac/benchmark_sac_mujoco_pb2.py | 9 +-- .../sac/pendulum_sac_envrunner.py | 7 +- rllib/utils/test_utils.py | 23 ++----- 24 files changed, 158 insertions(+), 128 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index ef043775504f..1af59e78d8f2 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -2753,8 +2753,8 @@ def fault_tolerance( delay_between_env_runner_restarts_s: Optional[float] = NotProvided, restart_failed_sub_environments: Optional[bool] = NotProvided, num_consecutive_env_runner_failures_tolerance: Optional[int] = NotProvided, - env_runner_health_probe_timeout_s: int = NotProvided, - env_runner_restore_timeout_s: int = NotProvided, + env_runner_health_probe_timeout_s: Optional[float] = NotProvided, + env_runner_restore_timeout_s: Optional[float] = NotProvided, # Deprecated args. 
recreate_failed_workers=DEPRECATED_VALUE, max_num_worker_restarts=DEPRECATED_VALUE, @@ -2790,9 +2790,9 @@ def fault_tolerance( failures, the EnvRunner itself is NOT affected and won't throw any errors as the flawed sub-environment is silently restarted under the hood. - env_runner_health_probe_timeout_s: Max amount of time we should spend - waiting for health probe calls to finish. Health pings are very cheap, - so the default is 1 minute. + env_runner_health_probe_timeout_s: Max amount of time (in seconds) we should + spend waiting for health probe calls to finish. Health pings are very + cheap, so the default is 1 minute. env_runner_restore_timeout_s: Max amount of time we should wait to restore states on recovered EnvRunner actors. Default is 30 mins. diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index fc1a180477f2..09ff35ac9eaf 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -11,7 +11,6 @@ LEARNER_RESULTS_CURR_LR_KEY, ) -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.metrics import LEARNER_RESULTS from ray.rllib.utils.test_utils import ( @@ -75,11 +74,11 @@ def test_ppo_compilation_and_schedule_mixins(self): config = ( ppo.PPOConfig() # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=0) .training( num_sgd_iter=2, # Setup lr schedule for testing lr-scheduling correctness. 
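
For reference, a minimal sketch (not part of the patch itself) of the post-migration config pattern the hunks above converge on: both new-API-stack flags go through `.api_stack()`, and no `env_runner_cls` needs to be passed anymore because the WorkerSet change earlier in this series selects the matching EnvRunner automatically.

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    # Both flags together replace the old `.experimental(_enable_new_api_stack=True)`.
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    # `env_runner_cls` is omitted; the correct EnvRunner class is auto-selected.
    .env_runners(num_env_runners=0)
)
algo = config.build()
print(algo.train())
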
diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 7b7c66ad96a3..6848503933b9 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -4,7 +4,6 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.utils.test_utils import framework_iterator @@ -72,12 +71,14 @@ def tearDownClass(cls): def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") .env_runners( - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", - env_runner_cls=SingleAgentEnvRunner, ) .callbacks(EpisodeAndSampleCallbacks) .training( @@ -115,11 +116,13 @@ def test_episode_and_sample_callbacks_batch_mode_truncate_episodes(self): def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") .env_runners( batch_mode="complete_episodes", - env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=0, ) .callbacks(EpisodeAndSampleCallbacks) @@ -158,8 +161,10 @@ def test_overriding_on_episode_created_throws_error_on_new_api_stack(self): """Tests, whw""" config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .env_runners(env_runner_cls=SingleAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .callbacks(OnEpisodeCreatedCallback) ) self.assertRaises(ValueError, lambda: config.validate()) diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index 9a0e712b3f91..96e951119d69 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -389,9 +389,11 @@ def test_fatal_single_agent(self): # Test the case where all workers fail (w/o recovery). self._do_test_failing_fatal( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, env_to_module_connector=lambda env: FlattenObservations(), ) ) @@ -400,8 +402,10 @@ def test_fatal_multi_agent(self): # Test the case where all workers fail (w/o recovery). 
self._do_test_failing_fatal( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .env_runners(env_runner_cls=MultiAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .multi_agent(policies={"p0"}, policy_mapping_fn=lambda *a, **k: "p0"), ) @@ -409,7 +413,10 @@ def test_fatal_multi_agent(self): # def test_async_samples(self): # self._do_test_fault_ignore( # ImpalaConfig() - # .api_stack(enable_rl_module_and_learner=True) + # .api_stack( + # enable_rl_module_and_learner=True, + # enable_env_runners_and_connector_v2=True, + # ) # .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) # .resources(num_gpus=0) # ) @@ -417,7 +424,10 @@ def test_fatal_multi_agent(self): def test_sync_replay(self): self._do_test_failing_ignore( SACConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment( env_config={"action_space": gym.spaces.Box(0, 1, (2,), np.float32)} ) @@ -429,10 +439,11 @@ def test_sync_replay(self): def test_multi_gpu(self): self._do_test_failing_ignore( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .env_runners( - env_runner_cls=ForwardHealthCheckToEnvWorker, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training( train_batch_size=10, sgd_minibatch_size=1, @@ -443,7 +454,10 @@ def test_multi_gpu(self): def test_sync_samples(self): self._do_test_failing_ignore( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(optimizer={}) ) @@ -452,7 +466,10 @@ def test_eval_workers_failing_ignore(self): # Test the case where one eval worker fails, but we chose to ignore. self._do_test_failing_ignore( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .training(model={"fcnet_hiddens": [4]}), fail_eval=True, @@ -462,7 +479,10 @@ def test_eval_workers_parallel_to_training_failing_recover(self): # Test the case where all eval workers fail, but we chose to recover. config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorker) .evaluation( evaluation_num_env_runners=1, @@ -482,7 +502,10 @@ def test_eval_workers_parallel_to_training_multi_agent_failing_recover( # to recover. 
config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners(env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent) .multi_agent( policies={"main", "p0", "p1"}, @@ -518,7 +541,10 @@ def test_workers_failing_recover(self): config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -574,7 +600,10 @@ def test_modules_are_restored_on_recovered_worker(self): config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorkerMultiAgent, num_env_runners=2, @@ -678,7 +707,10 @@ def test_eval_workers_failing_recover(self): config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, @@ -746,7 +778,10 @@ def test_worker_failing_recover_with_hanging_workers(self): # the execution of the algorithm b/c of a single heavily stalling worker. # Timeout data (batches or episodes) are discarded. SACConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .training( replay_buffer_config={"type": "EpisodeReplayBuffer"}, ) diff --git a/rllib/env/tests/test_multi_agent_env_runner.py b/rllib/env/tests/test_multi_agent_env_runner.py index 5cb1a889feff..1f7f51243afb 100644 --- a/rllib/env/tests/test_multi_agent_env_runner.py +++ b/rllib/env/tests/test_multi_agent_env_runner.py @@ -95,12 +95,14 @@ def _build_config(self): # Build the configuration and use `PPO`. config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment( MultiAgentCartPole, env_config={"num_agents": 2}, ) - .env_runners(env_runner_cls=MultiAgentEnvRunner) # TODO (sven, simon): Setup is still for `Policy`, change as soon # as we have switched fully to the new stack. 
.multi_agent( diff --git a/rllib/evaluation/tests/test_envs_that_crash.py b/rllib/evaluation/tests/test_envs_that_crash.py index eb8fd6a8c346..e546601d2bf3 100644 --- a/rllib/evaluation/tests/test_envs_that_crash.py +++ b/rllib/evaluation/tests/test_envs_that_crash.py @@ -121,7 +121,6 @@ def test_env_crash_on_one_worker_during_sampling_but_recreate_worker(self): PPOConfig() .api_stack(enable_rl_module_and_learner=True) .env_runners( - # env_runner_cls=ForwardHealthCheckToEnvWorker, num_env_runners=2, rollout_fragment_length=10, num_envs_per_env_runner=3, diff --git a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py index 1a83f25a1335..046613a49a27 100644 --- a/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_league_based_with_open_spiel.py @@ -36,7 +36,6 @@ import ray from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv from ray.rllib.examples.multi_agent.utils import ( @@ -160,7 +159,10 @@ def _get_multi_agent(): get_trainable_cls(args.algo) .get_default_config() # Use new API stack ... - .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play @@ -175,13 +177,8 @@ def _get_multi_agent(): ) ) .env_runners( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. 
- env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), + num_env_runners=args.num_env_runners, + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) .resources( num_learner_workers=args.num_gpus, diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index ff2963bb1670..2c8cb85e0e54 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -24,7 +24,6 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.env.utils import try_import_pyspiel, try_import_open_spiel from ray.rllib.env.wrappers.open_spiel import OpenSpielEnv from ray.rllib.examples.rl_modules.classes.random_rlm import RandomRLModule @@ -106,7 +105,10 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config = ( get_trainable_cls(args.algo) .get_default_config() - .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, + ) .environment("open_spiel_env") .framework(args.framework) # Set up the main piece in this experiment: The league-bases self-play @@ -125,11 +127,6 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .env_runners( num_rollout_workers=args.num_env_runners, num_envs_per_worker=1 if args.enable_new_api_stack else 5, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None if not args.enable_new_api_stack else MultiAgentEnvRunner - ), ) .resources( num_learner_workers=args.num_gpus, diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 0ddb58edb48c..5e0e441eeac0 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -43,7 +43,6 @@ import numpy as np from ray import train, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.utils.framework import try_import_torch torch, _ = try_import_torch() @@ -155,12 +154,12 @@ def my_experiment(config: Dict): if __name__ == "__main__": base_config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .environment("CartPole-v1") - .env_runners( - num_rollout_workers=0, - env_runner_cls=SingleAgentEnvRunner, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .environment("CartPole-v1") + .env_runners(num_env_runners=0) ) # Convert to a plain dict for Tune. Note that this is usually not needed, you can # pass into the below Tune Tuner any instantiated RLlib AlgorithmConfig object. 
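
The removals of the explicit `env_runner_cls=...` settings above rely on the WorkerSet change from earlier in this series. A simplified, illustrative sketch of that selection logic (the helper name `resolve_env_runner_cls` is made up here; the real code lives inline in `WorkerSet.__init__`):

from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner
from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner
from ray.rllib.evaluation.rollout_worker import RolloutWorker


def resolve_env_runner_cls(config):
    # An explicit user override always wins.
    if config.env_runner_cls is not None:
        return config.env_runner_cls
    # New API stack: pick the EnvRunner matching the agent setup.
    if config.enable_env_runner_and_connector_v2:
        return MultiAgentEnvRunner if config.is_multi_agent() else SingleAgentEnvRunner
    # Old API stack keeps using RolloutWorker.
    return RolloutWorker

With this in place, example scripts only need to flip the two `api_stack()` flags; the matching single- or multi-agent EnvRunner follows automatically.
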
diff --git a/rllib/examples/ray_tune/custom_logger.py b/rllib/examples/ray_tune/custom_logger.py index 2644b38fb4e6..9ec3af15a2ae 100644 --- a/rllib/examples/ray_tune/custom_logger.py +++ b/rllib/examples/ray_tune/custom_logger.py @@ -52,7 +52,6 @@ from ray import air, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.logger import Logger, LegacyLoggerCallback @@ -83,8 +82,10 @@ def flush(self): if __name__ == "__main__": config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .env_runners(env_runner_cls=SingleAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") # Setting up a custom logger config. # ---------------------------------- diff --git a/rllib/examples/ray_tune/custom_progress_reporter.py b/rllib/examples/ray_tune/custom_progress_reporter.py index b9d085c4c58c..57bb64dff8f6 100644 --- a/rllib/examples/ray_tune/custom_progress_reporter.py +++ b/rllib/examples/ray_tune/custom_progress_reporter.py @@ -45,7 +45,6 @@ """ from ray import air, tune from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole @@ -89,8 +88,10 @@ config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) - .env_runners(env_runner_cls=MultiAgentEnvRunner) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("env") .multi_agent( # Define 3 policies. Note that in our simple setup, they are all configured diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index 70934b6aee8f..52f4c658f7f8 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -4,7 +4,6 @@ from ray.rllib.algorithms.dqn.dqn import DQNConfig from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -297,11 +296,13 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=1, env_to_module_connector=_make_env_to_module_connector, ) diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 33b3d4f6afa7..236c376b0414 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -1,7 +1,6 @@ import gymnasium as gym from ray.rllib.algorithms.dqn.dqn import DQNConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack from ray.tune import Stopper from ray import train, tune @@ -290,11 +289,13 @@ def stop_all(self): clip_rewards=True, ) # Enable new API stack and use EnvRunner. 
- .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - env_runner_cls=SingleAgentEnvRunner, num_rollout_workers=1, ) .resources( diff --git a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py index 00f61f76142a..3c3d4ff6b4d6 100644 --- a/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py +++ b/rllib/tuned_examples/dqn/cartpole_dqn_envrunner.py @@ -1,15 +1,14 @@ from ray.rllib.algorithms.dqn import DQNConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( DQNConfig() .environment(env="CartPole-v1") .framework(framework="torch") - .api_stack(enable_rl_module_and_learner=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=0) .resources( num_learner_workers=0, ) diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py index 5674437d9ead..f6d53f61ba5b 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -80,11 +79,13 @@ def stop_all(self): PPOConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, # Following the paper. - num_rollout_workers=32, + num_env_runners=32, rollout_fragment_length=512, ) .resources( diff --git a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py index 7d70009dba5e..3683c0fb2a38 100644 --- a/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py +++ b/rllib/tuned_examples/ppo/benchmark_ppo_mujoco_pb2.py @@ -1,6 +1,5 @@ import time from ray.rllib.algorithms.ppo.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.schedulers.pb2 import PB2 from ray import train, tune @@ -70,11 +69,13 @@ PPOConfig() .environment(env=env) # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=num_rollout_workers, + num_env_runners=num_rollout_workers, # TODO (sven, simon): Add resources. ) .resources( diff --git a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py index 0ca4ba3155b0..dd1282df1a60 100644 --- a/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/cartpole_ppo_envrunner.py @@ -1,15 +1,14 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig() # Enable new API stack and use EnvRunner. 
- .api_stack(enable_rl_module_and_learner=True) - .env_runners( - env_runner_cls=SingleAgentEnvRunner, - num_env_runners=1, + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, ) + .env_runners(num_env_runners=1) .environment("CartPole-v1") .rl_module( model_config_dict={ diff --git a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py index 2848e045b044..65c6d3dc4261 100644 --- a/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py +++ b/rllib/tuned_examples/ppo/memory_leak_test_ppo_new_stack.py @@ -1,15 +1,16 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.examples.envs.classes.random_env import RandomLargeObsSpaceEnv config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) # Switch off np.random, which is known to have memory leaks. .environment(RandomLargeObsSpaceEnv, env_config={"static_samples": True}) .env_runners( - env_runner_cls=SingleAgentEnvRunner, num_env_runners=4, num_envs_per_env_runner=5, ) diff --git a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py index 743fb5625f0d..3c9e2224ecab 100644 --- a/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/multi_agent_pendulum_ppo_envrunner.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.tune.registry import register_env @@ -8,10 +7,12 @@ config = ( PPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("multi_agent_pendulum") .env_runners( - env_runner_cls=MultiAgentEnvRunner, num_envs_per_env_runner=1, num_env_runners=2, ) diff --git a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py index 816ea97f1e6a..7d2d03c415cd 100644 --- a/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py +++ b/rllib/tuned_examples/ppo/pendulum_ppo_envrunner.py @@ -1,13 +1,14 @@ from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( PPOConfig() # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( - env_runner_cls=SingleAgentEnvRunner, num_env_runners=2, num_envs_per_env_runner=20, ) diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py index d6f052fa251d..9579810f647d 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco.py @@ -1,5 +1,4 @@ from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune import Stopper from ray import train, tune @@ -68,11 +67,13 @@ def stop_all(self): SACConfig() .environment(env=tune.grid_search(list(benchmark_envs.keys()))) # Enable new API stack and use EnvRunner. 
- .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=0, + num_env_runners=0, ) .resources( # Note, we have a sample/train ratio of 1:1 and a small train diff --git a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py index 24f6ddca08ad..0d73a0816bc4 100644 --- a/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py +++ b/rllib/tuned_examples/sac/benchmark_sac_mujoco_pb2.py @@ -1,6 +1,5 @@ import time from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.tune.schedulers.pb2 import PB2 from ray import train, tune @@ -58,11 +57,13 @@ SACConfig() .environment(env=env) # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length="auto", - env_runner_cls=SingleAgentEnvRunner, - num_rollout_workers=1, + num_env_runners=1, # TODO (sven, simon): Add resources. ) .resources( diff --git a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py index d38741955836..2b04c62b099c 100644 --- a/rllib/tuned_examples/sac/pendulum_sac_envrunner.py +++ b/rllib/tuned_examples/sac/pendulum_sac_envrunner.py @@ -1,13 +1,14 @@ from ray.rllib.algorithms.sac.sac import SACConfig -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner config = ( SACConfig() # Enable new API stack and use EnvRunner. - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .env_runners( rollout_fragment_length=1, - env_runner_cls=SingleAgentEnvRunner, num_env_runners=0, ) .environment(env="Pendulum-v1") diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 46ca03db4b05..88c23abb2b2c 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1355,30 +1355,17 @@ def run_rllib_example_script_experiment( "training_iteration": args.stop_iters, } - from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner - from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner - # Enhance the `base_config`, based on provided `args`. config = ( # Set the framework. base_config.framework(args.framework) # Enable the new API stack? - .api_stack(enable_rl_module_and_learner=args.enable_new_api_stack) - # Define EnvRunner/RolloutWorker scaling and behavior. - .env_runners( - num_env_runners=args.num_env_runners, - # Set up the correct env-runner to use depending on - # old-stack/new-stack and multi-agent settings. - env_runner_cls=( - None - if not args.enable_new_api_stack - else ( - SingleAgentEnvRunner - if args.num_agents == 0 - else MultiAgentEnvRunner - ) - ), + .api_stack( + enable_rl_module_and_learner=args.enable_new_api_stack, + enable_env_runner_and_connector_v2=args.enable_new_api_stack, ) + # Define EnvRunner/RolloutWorker scaling and behavior. + .env_runners(num_env_runners=args.num_env_runners) # Define compute resources used. .resources( # Old stack. 
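
As a compact summary of the renames applied across the tuned examples above, a hedged sketch modeled on the pendulum_sac_envrunner.py hunk (not copied from it verbatim):

from ray.rllib.algorithms.sac.sac import SACConfig

config = (
    SACConfig()
    .environment("Pendulum-v1")
    # Replaces `.experimental(_enable_new_api_stack=True)`.
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    # `num_rollout_workers` -> `num_env_runners`,
    # `num_envs_per_worker` -> `num_envs_per_env_runner`,
    # and `env_runner_cls=SingleAgentEnvRunner` is dropped entirely.
    .env_runners(
        rollout_fragment_length=1,
        num_env_runners=0,
        num_envs_per_env_runner=1,
    )
)
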
From d7b1b0f629aff19a2ec50f04397869d2467205f5 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 11:18:38 +0200 Subject: [PATCH 07/15] wip Signed-off-by: sven1977 --- doc/source/rllib/rllib-training.rst | 2 +- rllib/algorithms/algorithm.py | 6 +-- rllib/algorithms/algorithm_config.py | 41 +++++++++---------- rllib/algorithms/impala/impala.py | 4 +- rllib/algorithms/sac/sac.py | 4 +- .../tests/test_callbacks_old_stack.py | 10 ++--- .../tests/test_callbacks_on_algorithm.py | 2 +- .../tests/test_callbacks_on_env_runner.py | 2 +- rllib/env/multi_agent_env_runner.py | 2 +- rllib/env/single_agent_env_runner.py | 6 +-- .../env/tests/test_single_agent_env_runner.py | 10 ++--- rllib/evaluation/rollout_worker.py | 2 +- rllib/evaluation/tests/test_rollout_worker.py | 28 ++++++------- rllib/examples/_docs/rllib_on_rllib_readme.py | 2 +- .../_old_api_stack/complex_struct_space.py | 2 +- .../connectors/prepare_checkpoint.py | 4 +- .../self_play_with_policy_checkpoint.py | 4 +- .../remote_base_env_with_custom_api.py | 6 +-- ...e_envs_with_inference_done_on_main_node.py | 6 +-- ...raining_step_on_and_off_policy_combined.py | 2 +- .../examples/catalogs/mobilenet_v2_encoder.py | 2 +- rllib/examples/checkpoints/onnx_torch.py | 2 +- .../curriculum/curriculum_learning.py | 2 +- .../debugging/deterministic_training.py | 4 +- .../envs/env_rendering_and_recording.py | 2 +- .../envs/external_envs/cartpole_server.py | 2 +- .../envs/external_envs/unity3d_server.py | 2 +- rllib/examples/envs/greyscale_env.py | 4 +- rllib/examples/envs/unity3d_env_local.py | 2 +- rllib/examples/gpus/fractional_gpus.py | 6 +-- .../hierarchical/hierarchical_training.py | 4 +- .../multi_agent/multi_agent_cartpole.py | 2 +- .../multi_agent/multi_agent_pendulum.py | 2 +- .../multi_agent/self_play_with_open_spiel.py | 4 +- rllib/examples/offline_rl/offline_rl.py | 2 +- rllib/examples/ray_tune/custom_experiment.py | 2 +- .../rl_modules/classes/mobilenet_rlm.py | 2 +- .../appo/cartpole-appo-separate-losses.py | 2 +- ...hing-and-stalling-recreate-workers-appo.py | 2 +- ...cartpole-crashing-recreate-workers-appo.py | 2 +- .../tuned_examples/dqn/benchmark_dqn_atari.py | 2 +- ...benchmark_dqn_atari_rllib_preprocessing.py | 2 +- rllib/utils/test_utils.py | 4 +- 43 files changed, 101 insertions(+), 104 deletions(-) diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 52915edbbc45..d6a2c1e8f249 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -220,7 +220,7 @@ These functions return values for each worker as a list. You can also access just the "master" copy of the algorithm state through ``Algorithm.get_policy()`` or ``Algorithm.workers.local_worker()``, but note that updates here may not be immediately reflected in -your rollout workers (if you have configured ``num_rollout_workers > 0``). +your rollout workers (if you have configured ``num_env_runners > 0``). Here's a quick example of how to access state of a model: .. 
literalinclude:: ./doc_code/getting_started.py diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index f0cda796eb78..6ce237fa763d 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -1428,7 +1428,7 @@ def _env_runner_remote(worker, num, round, iter): units_per_healthy_remote_worker = ( 1 if unit == "episodes" - else eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_worker + else eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_env_runner ) # Select proper number of evaluation workers for this round. selected_eval_worker_ids = [ @@ -2692,7 +2692,7 @@ def resource_help(cls, config: Union[AlgorithmConfig, AlgorithmConfigDict]) -> s "\n\nYou can adjust the resource requests of RLlib Algorithms by calling " "`AlgorithmConfig.resources(" "num_gpus=.., num_cpus_per_worker=.., num_gpus_per_worker=.., ..)` or " - "`AgorithmConfig.env_runners(num_rollout_workers=..)`. See " + "`AgorithmConfig.env_runners(num_env_runners=..)`. See " "the `ray.rllib.algorithms.algorithm_config.AlgorithmConfig` classes " "(each Algorithm has its own subclass of this class) for more info.\n\n" f"The config of this Algorithm is: {config}" @@ -3287,7 +3287,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread( # In case all the remote evaluation workers die during a round of # evaluation, we need to stop. units_per_healthy_remote_worker = ( - eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_worker + eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_env_runner ) # Select proper number of evaluation workers for this round. selected_eval_worker_ids = [ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 1af59e78d8f2..8e6b2fb6bbd2 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -366,10 +366,6 @@ def __init__(self, algo_class: Optional[type] = None): self.use_worker_filter_stats = True self.enable_connectors = True self.sampler_perf_stats_ema_coef = None - # Deprecated args. - self.num_rollout_workers = DEPRECATED_VALUE - self.num_envs_per_worker = DEPRECATED_VALUE - self.validate_workers_after_construction = DEPRECATED_VALUE # `self.training()` self.gamma = 0.99 @@ -1679,8 +1675,8 @@ def env_runners( 1. RLlib collects 10 fragments of 100 steps each from rollout workers. 2. These fragments are concatenated and we perform an epoch of SGD. When using multiple envs per worker, the fragment size is multiplied by - `num_envs_per_worker`. This is since we are collecting steps from - multiple envs in parallel. For example, if num_envs_per_worker=5, then + `num_envs_per_env_runner`. This is since we are collecting steps from + multiple envs in parallel. For example, if num_envs_per_env_runner=5, then EnvRunners will return experiences in chunks of 5*100 = 500 steps. The dataflow here can vary per algorithm. For example, PPO further divides the train batch into minibatches for multi-epoch SGD. @@ -1692,7 +1688,7 @@ def env_runners( env- or agent-steps) and depends on the `count_steps_by` setting, adjustable via `AlgorithmConfig.multi_agent(count_steps_by=..)`: 1) "truncate_episodes": Each call to `EnvRunner.sample()` will return a - batch of at most `rollout_fragment_length * num_envs_per_worker` in + batch of at most `rollout_fragment_length * num_envs_per_env_runner` in size. The batch will be exactly `rollout_fragment_length * num_envs` in size if postprocessing does not change batch sizes. 
Episodes may be truncated in order to meet this size requirement. @@ -1700,17 +1696,17 @@ def env_runners( variance as the future return must now be estimated at truncation boundaries. 2) "complete_episodes": Each call to `EnvRunner.sample()` will return a - batch of at least `rollout_fragment_length * num_envs_per_worker` in + batch of at least `rollout_fragment_length * num_envs_per_env_runner` in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the (minimum) batch size. - Note that when `num_envs_per_worker > 1`, episode steps will be buffered + Note that when `num_envs_per_env_runner > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. explore: Default exploration behavior, iff `explore=None` is passed into compute_action(s). Set to False for no exploration behavior (e.g., for evaluation). exploration_config: A dict specifying the Exploration object's config. - remote_worker_envs: If using num_envs_per_worker > 1, whether to create + remote_worker_envs: If using num_envs_per_env_runner > 1, whether to create those new envs in remote processes instead of in the same worker. This adds overheads, but can make sense if your envs can take much time to step / reset (e.g., for StarCraft). Use this cautiously; @@ -3034,7 +3030,7 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: Uses the simple formula: `rollout_fragment_length` = `total_train_batch_size` / - (`num_envs_per_worker` * `num_env_runners`) + (`num_envs_per_env_runner` * `num_env_runners`) If result is a fraction AND `worker_index` is provided, will make those workers add additional timesteps, such that the overall batch size (across @@ -3056,13 +3052,13 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: # -> 512 / 40 -> 12.8 -> diff=32 (12 * 40 = 480) # -> worker 1: 13, workers 2: 12 rollout_fragment_length = self.total_train_batch_size / ( - self.num_envs_per_worker * (self.num_env_runners or 1) + self.num_envs_per_env_runner * (self.num_env_runners or 1) ) if int(rollout_fragment_length) != rollout_fragment_length: diff = self.total_train_batch_size - int( rollout_fragment_length - ) * self.num_envs_per_worker * (self.num_env_runners or 1) - if ((worker_index - 1) * self.num_envs_per_worker) >= diff: + ) * self.num_envs_per_env_runner * (self.num_env_runners or 1) + if ((worker_index - 1) * self.num_envs_per_env_runner) >= diff: return int(rollout_fragment_length) else: return int(rollout_fragment_length) + 1 @@ -3399,7 +3395,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: dependent on rollout_fragment_length (synchronous sampling, on-policy PG algos). If rollout_fragment_length != "auto", makes sure that the product of - `rollout_fragment_length` x `num_env_runners` x `num_envs_per_worker` + `rollout_fragment_length` x `num_env_runners` x `num_envs_per_env_runner` roughly (10%) matches the provided `train_batch_size`. Otherwise, errors with asking the user to set rollout_fragment_length to `auto` or to a matching value. 
@@ -3418,7 +3414,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: ): min_batch_size = ( max(self.num_env_runners, 1) - * self.num_envs_per_worker + * self.num_envs_per_env_runner * self.rollout_fragment_length ) batch_size = min_batch_size @@ -3430,7 +3426,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: 0.1 * self.total_train_batch_size ): suggested_rollout_fragment_length = self.total_train_batch_size // ( - self.num_envs_per_worker * (self.num_env_runners or 1) + self.num_envs_per_env_runner * (self.num_env_runners or 1) ) raise ValueError( "Your desired `total_train_batch_size` " @@ -3438,7 +3434,7 @@ def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: f"learners x {self.train_batch_size_per_learner}) " "or a value 10% off of that cannot be achieved with your other " f"settings (num_env_runners={self.num_env_runners}; " - f"num_envs_per_worker={self.num_envs_per_worker}; " + f"num_envs_per_env_runner={self.num_envs_per_env_runner}; " f"rollout_fragment_length={self.rollout_fragment_length})! " "Try setting `rollout_fragment_length` to 'auto' OR to a value of " f"{suggested_rollout_fragment_length}." @@ -3944,12 +3940,13 @@ def _validate_multi_agent_settings(self): if ( self.is_multi_agent() and self.enable_env_runner_and_connector_v2 - and self.num_envs_per_worker > 1 + and self.num_envs_per_env_runner > 1 ): raise ValueError( - "For now, using env vectorization (`config.num_envs_per_worker > 1`) " - "in combination with multi-agent AND the new EnvRunners is not " - "supported! Try setting `config.num_envs_per_worker = 1`." + "For now, using env vectorization " + "(`config.num_envs_per_env_runner > 1`) in combination with " + "multi-agent AND the new EnvRunners is not supported! Try setting " + "`config.num_envs_per_env_runner = 1`." ) def _validate_evaluation_settings(self): diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index c4ee2bfb60ec..9f2b22fc2aac 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -554,9 +554,9 @@ class Impala(Algorithm): == Overview of data flow in IMPALA == 1. Policy evaluation in parallel across `num_workers` actors produces - batches of size `rollout_fragment_length * num_envs_per_worker`. + batches of size `rollout_fragment_length * num_envs_per_env_runner`. 2. If enabled, the replay buffer stores and produces batches of size - `rollout_fragment_length * num_envs_per_worker`. + `rollout_fragment_length * num_envs_per_env_runner`. 3. If enabled, the minibatch ring buffer stores and replays batches of size `train_batch_size` up to `num_sgd_iter` times per batch. 4. The learner thread executes data parallel SGD across `num_gpus` GPUs diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 32489cdeb4b6..0d8aa17e399a 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -217,7 +217,7 @@ def training( collecting samples from the env). If None, uses "natural" values of: `train_batch_size` / (`rollout_fragment_length` x `num_workers` x - `num_envs_per_worker`). + `num_envs_per_env_runner`). If not None, will make sure that the ratio between timesteps inserted into and sampled from th buffer matches the given values. 
Example: @@ -225,7 +225,7 @@ def training( train_batch_size=250 rollout_fragment_length=1 num_workers=1 (or 0) - num_envs_per_worker=1 + num_envs_per_env_runner=1 -> natural value = 250 / 1 = 250.0 -> will make sure that replay+train op will be executed 4x asoften as rollout+insert op (4 * 250 = 1000). diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index f01b86151711..11fea5c94a7a 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -78,7 +78,7 @@ def test_episode_and_sample_callbacks(self): config = ( PPOConfig() .environment("CartPole-v1") - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .callbacks(EpisodeAndSampleCallbacks) .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) ) @@ -99,7 +99,7 @@ def test_on_sub_environment_created(self): dqn.DQNConfig().environment("CartPole-v1") # Create 4 sub-environments per remote worker. # Create 2 remote workers. - .env_runners(num_envs_per_worker=4, num_rollout_workers=2) + .env_runners(num_envs_per_env_runner=4, num_env_runners=2) ) for callbacks in ( @@ -135,10 +135,10 @@ def test_on_sub_environment_created_with_remote_envs(self): # Make each sub-environment a ray actor. remote_worker_envs=True, # Create 2 remote workers. - num_rollout_workers=2, + num_env_runners=2, # Create 4 sub-environments (ray remote actors) per remote # worker. - num_envs_per_worker=4, + num_envs_per_env_runner=4, ) ) @@ -179,7 +179,7 @@ def test_on_episode_created(self): "p_terminated": 0.0, }, ) - .env_runners(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_env_runner=2, num_env_runners=1) .callbacks(OnEpisodeCreatedCallback) ) diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py index d9f6b4cdc718..c3533ab6ac8b 100644 --- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py +++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py @@ -58,7 +58,7 @@ def test_on_workers_recreated_callback(self): APPOConfig() .environment("env") .callbacks(OnWorkersRecreatedCallbacks) - .env_runners(num_rollout_workers=3) + .env_runners(num_env_runners=3) .fault_tolerance( recreate_failed_env_runners=True, delay_between_env_runner_restarts_s=0, diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py index 6848503933b9..062f39a99f01 100644 --- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py +++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py @@ -123,7 +123,7 @@ def test_episode_and_sample_callbacks_batch_mode_complete_episodes(self): .environment("CartPole-v1") .env_runners( batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ) .callbacks(EpisodeAndSampleCallbacks) .training( diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 67bc34f8885b..547dddf18359 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -758,7 +758,7 @@ def make_env(self): env_ctx = EnvContext( env_ctx, worker_index=self.worker_index, - num_workers=self.config.num_rollout_workers, + num_workers=self.config.num_env_runners, remote=self.config.remote_worker_envs, ) diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 5a4457dd7762..a468de8990af 100644 --- a/rllib/env/single_agent_env_runner.py +++ 
b/rllib/env/single_agent_env_runner.py @@ -704,7 +704,7 @@ def make_env(self) -> None: env_ctx = EnvContext( env_ctx, worker_index=self.worker_index, - num_workers=self.config.num_rollout_workers, + num_workers=self.config.num_env_runners, remote=self.config.remote_worker_envs, ) @@ -730,12 +730,12 @@ def make_env(self) -> None: self.env: gym.Wrapper = gym.wrappers.VectorListInfo( gym.vector.make( "rllib-single-agent-env-v0", - num_envs=self.config.num_envs_per_worker, + num_envs=self.config.num_envs_per_env_runner, asynchronous=self.config.remote_worker_envs, ) ) self.num_envs: int = self.env.num_envs - assert self.num_envs == self.config.num_envs_per_worker + assert self.num_envs == self.config.num_envs_per_env_runner # Set the flag to reset all envs upon the next `sample()` call. self._needs_initial_reset = True diff --git a/rllib/env/tests/test_single_agent_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py index 2045963e899c..83c7bf083c22 100644 --- a/rllib/env/tests/test_single_agent_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -18,7 +18,7 @@ def test_sample(self): config = ( AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. - .env_runners(num_envs_per_worker=2, rollout_fragment_length=64) + .env_runners(num_envs_per_env_runner=2, rollout_fragment_length=64) ) env_runner = SingleAgentEnvRunner(config=config) @@ -64,8 +64,8 @@ def test_distributed_env_runner(self): AlgorithmConfig().environment("CartPole-v1") # Vectorize x2 and by default, rollout 64 timesteps per individual env. .env_runners( - num_rollout_workers=5, - num_envs_per_worker=5, + num_env_runners=5, + num_envs_per_env_runner=5, rollout_fragment_length=10, remote_worker_envs=envs_parallel, ) @@ -73,7 +73,7 @@ def test_distributed_env_runner(self): array = [ remote_class.remote(config=config) - for _ in range(config.num_rollout_workers) + for _ in range(config.num_env_runners) ] # Sample in parallel. results = [a.sample.remote(random_actions=True) for a in array] @@ -83,7 +83,7 @@ def test_distributed_env_runner(self): # Assert length of all fragments is `rollout_fragment_length`. 
self.assertEqual( sum(len(e) for e in episodes), - config.num_envs_per_worker * config.rollout_fragment_length, + config.num_envs_per_env_runner * config.rollout_fragment_length, ) diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index cad7795680bb..d7fb7101fee8 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -695,7 +695,7 @@ def sample(self, **kwargs) -> SampleBatchType: self.config.batch_mode == "truncate_episodes" and not self.config.offline_sampling ): - max_batches = self.config.num_envs_per_worker + max_batches = self.config.num_envs_per_env_runner else: max_batches = float("inf") while steps_so_far < self.total_rollout_fragment_length and ( diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 254e515e3133..6540cb187bc3 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ b/rllib/evaluation/tests/test_rollout_worker.py @@ -684,7 +684,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=15, - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", ), ) @@ -700,7 +700,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .env_runners( - num_rollout_workers=0, + num_env_runners=0, batch_mode="truncate_episodes", rollout_fragment_length=301, ) @@ -725,7 +725,7 @@ def test_truncate_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .env_runners( - num_rollout_workers=0, + num_env_runners=0, rollout_fragment_length=301, ) .multi_agent( @@ -754,7 +754,7 @@ def test_complete_episodes(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=5, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) @@ -768,7 +768,7 @@ def test_complete_episodes_packing(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( rollout_fragment_length=15, - num_rollout_workers=0, + num_env_runners=0, batch_mode="complete_episodes", ), ) @@ -786,7 +786,7 @@ def test_filter_sync(self): env_creator=lambda _: gym.make("CartPole-v1"), default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( - num_rollout_workers=0, + num_env_runners=0, observation_filter="ConcurrentMeanStdFilter", ), ) @@ -804,7 +804,7 @@ def test_get_filters(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", - num_rollout_workers=0, + num_env_runners=0, ), ) self.sample_and_flush(ev) @@ -823,7 +823,7 @@ def test_sync_filter(self): default_policy_class=MockPolicy, config=AlgorithmConfig().env_runners( observation_filter="ConcurrentMeanStdFilter", - num_rollout_workers=0, + num_env_runners=0, ), ) obs_f = self.sample_and_flush(ev) @@ -852,7 +852,7 @@ def test_extra_python_envs(self): default_policy_class=MockPolicy, config=AlgorithmConfig() .python_environment(extra_python_environs_for_driver=extra_envs) - .env_runners(num_rollout_workers=0), + .env_runners(num_env_runners=0), ) self.assertTrue("env_key_1" in os.environ) self.assertTrue("env_key_2" in os.environ) @@ -876,7 +876,7 @@ def test_multi_env_seed(self): env_creator=lambda _: MockEnv2(100), default_policy_class=MockPolicy, config=AlgorithmConfig() - .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_env_runners=0) .debugging(seed=1), ) # Make sure 
we can properly sample from the wrapped env. @@ -911,7 +911,7 @@ def step(self, action_dict): env_creator=lambda _: MockMultiAgentEnv(), default_policy_class=MockPolicy, config=AlgorithmConfig() - .env_runners(num_envs_per_env_runner=3, num_rollout_workers=0) + .env_runners(num_envs_per_env_runner=3, num_env_runners=0) .multi_agent(policies={"policy_1", "policy_2"}) .debugging(seed=1), ) @@ -929,7 +929,7 @@ def test_wrap_multi_agent_env(self): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) # Make sure we can properly sample from the wrapped env. @@ -961,7 +961,7 @@ def step(self, action): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) batch = ev.sample() @@ -976,7 +976,7 @@ def step(self, action): config=AlgorithmConfig().env_runners( rollout_fragment_length=5, batch_mode="complete_episodes", - num_rollout_workers=0, + num_env_runners=0, ), ) batch = ev.sample() diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index 1c3c2d330b4c..d0e9be49a97d 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -59,7 +59,7 @@ def step(self, action): env_config={"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))}, ) # Parallelize environment rollouts. - .env_runners(num_rollout_workers=3) + .env_runners(num_env_runners=3) ) algo = config.build() diff --git a/rllib/examples/_old_api_stack/complex_struct_space.py b/rllib/examples/_old_api_stack/complex_struct_space.py index c2e45b406c7b..075b8831d04c 100644 --- a/rllib/examples/_old_api_stack/complex_struct_space.py +++ b/rllib/examples/_old_api_stack/complex_struct_space.py @@ -40,7 +40,7 @@ PPOConfig() .environment(SimpleRPG) .framework(args.framework) - .env_runners(rollout_fragment_length=1, num_rollout_workers=0) + .env_runners(rollout_fragment_length=1, num_env_runners=0) .training(train_batch_size=2, model={"custom_model": "my_model"}) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py index ae191e78513a..35d151341fcb 100644 --- a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py @@ -23,8 +23,8 @@ def _policy_mapping_fn(*args, **kwargs): # and use a TF policy in a Torch training stack. .framework("tf2") .env_runners( - num_rollout_workers=1, - num_envs_per_worker=5, + num_env_runners=1, + num_envs_per_env_runner=5, # We will be restoring a TF2 policy. # So tell the RolloutWorkers to enable TF eager exec as well, even if # framework is set to torch. 
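A minimal sketch of the key renames these example hunks apply, showing the new spellings next to the old ones (PPOConfig, CartPole-v1 and the concrete counts are illustrative placeholders, not taken from the connector example itself):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(
        num_env_runners=1,          # formerly: num_rollout_workers
        num_envs_per_env_runner=5,  # formerly: num_envs_per_worker
    )
)
algo = config.build()
algo.stop()

The old spellings remain accepted through the deprecation properties on AlgorithmConfig (e.g. num_rollout_workers, num_envs_per_worker, shown further down in this series), which only emit a deprecation warning and forward to the new attributes.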
diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py index cae3c2493c82..38531c626b5f 100644 --- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py @@ -73,8 +73,8 @@ def main(checkpoint_dir): .framework("torch") .callbacks(partial(AddPolicyCallback, checkpoint_dir)) .env_runners( - num_rollout_workers=1, - num_envs_per_worker=5, + num_env_runners=1, + num_envs_per_env_runner=5, # We will be restoring a TF2 policy. # So tell the RolloutWorkers to enable TF eager exec as well, even if # framework is set to torch. diff --git a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py index a46d6b628133..77b47fb23083 100644 --- a/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py +++ b/rllib/examples/_old_api_stack/remote_base_env_with_custom_api.py @@ -120,11 +120,11 @@ def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: # through them in parallel. remote_worker_envs=True, # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_worker`)? - num_rollout_workers=args.num_workers, + # `num_envs_per_env_runner`)? + num_env_runners=args.num_workers, # This setting should not really matter as it does not affect the # number of GPUs reserved for each worker. - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_env_runner, ) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) diff --git a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py index 52b77840613e..014c6e9fc948 100644 --- a/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py +++ b/rllib/examples/_old_api_stack/remote_envs_with_inference_done_on_main_node.py @@ -108,7 +108,7 @@ def default_resource_request( { # Different bundle (meaning: possibly different node) # for your n "remote" envs (set remote_worker_envs=True). - "CPU": cf.num_envs_per_worker, + "CPU": cf.num_envs_per_env_runner, }, ], strategy=cf.placement_strategy, @@ -128,12 +128,12 @@ def default_resource_request( # Force sub-envs to be ray.actor.ActorHandles, so we can step # through them in parallel. remote_worker_envs=True, - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_worker, # Use a single worker (however, with n parallelized remote envs, maybe # even running on another node). # Action computations occur on the "main" (GPU?) node, while # the envs run on one or more CPU node(s). - num_rollout_workers=0, + num_env_runners=0, ) .resources( # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. 
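The two remote-env examples above combine the renamed keys with remote_worker_envs=True. A hedged sketch of that pattern, reduced to the config calls visible in the hunks (the env name and the sub-env count are placeholder assumptions):

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(
        # Keep action computation on the local ("main") worker only ...
        num_env_runners=0,
        # ... while its vectorized sub-envs become remote Ray actors that
        # can be stepped in parallel, possibly on other nodes.
        num_envs_per_env_runner=4,
        remote_worker_envs=True,
    )
)
print(config.num_envs_per_env_runner, config.remote_worker_envs)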
diff --git a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py index 71d67e48ba92..5bfe1dd513f7 100644 --- a/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py +++ b/rllib/examples/algorithms/custom_training_step_on_and_off_policy_combined.py @@ -196,7 +196,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): .environment("multi_agent_cartpole") .framework("torch" if args.torch else "tf") .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - .env_runners(num_rollout_workers=0, rollout_fragment_length=50) + .env_runners(num_env_runners=0, rollout_fragment_length=50) # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) .reporting(metrics_num_episodes_for_smoothing=30) diff --git a/rllib/examples/catalogs/mobilenet_v2_encoder.py b/rllib/examples/catalogs/mobilenet_v2_encoder.py index c2e48cbab0c5..beebdb79f773 100644 --- a/rllib/examples/catalogs/mobilenet_v2_encoder.py +++ b/rllib/examples/catalogs/mobilenet_v2_encoder.py @@ -50,7 +50,7 @@ def _get_encoder_config( catalog_class=MobileNetEnhancedPPOCatalog ) ) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! diff --git a/rllib/examples/checkpoints/onnx_torch.py b/rllib/examples/checkpoints/onnx_torch.py index 963c665fc466..77a1ffb5f28a 100644 --- a/rllib/examples/checkpoints/onnx_torch.py +++ b/rllib/examples/checkpoints/onnx_torch.py @@ -15,7 +15,7 @@ ppo.PPOConfig() # ONNX is not supported by RLModule API yet. .api_stack(enable_rl_module_and_learner=False) - .env_runners(num_rollout_workers=1) + .env_runners(num_env_runners=1) .framework("torch") ) diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 2f5dd21f3c4a..b0cb6865e98a 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -214,7 +214,7 @@ def on_train_result( model={"vf_share_layers": True}, ) .env_runners( - num_envs_per_worker=5, + num_envs_per_env_runner=5, env_to_module_connector=lambda env: [ AddObservationsFromEpisodesToBatch(), FlattenObservations(), diff --git a/rllib/examples/debugging/deterministic_training.py b/rllib/examples/debugging/deterministic_training.py index a59327c7e919..12bccb28f508 100644 --- a/rllib/examples/debugging/deterministic_training.py +++ b/rllib/examples/debugging/deterministic_training.py @@ -40,8 +40,8 @@ ) .framework(args.framework) .env_runners( - num_rollout_workers=1, - num_envs_per_worker=2, + num_env_runners=1, + num_envs_per_env_runner=2, rollout_fragment_length=50, ) .resources( diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py index de915f8be8e5..e5fcd4891e59 100644 --- a/rllib/examples/envs/env_rendering_and_recording.py +++ b/rllib/examples/envs/env_rendering_and_recording.py @@ -101,7 +101,7 @@ def render(self, mode="rgb"): ) .framework(args.framework) # Use a vectorized env with 2 sub-envs. - .env_runners(num_envs_per_worker=2, num_rollout_workers=1) + .env_runners(num_envs_per_env_runner=2, num_env_runners=1) .evaluation( # Evaluate once per training iteration. 
evaluation_interval=1, diff --git a/rllib/examples/envs/external_envs/cartpole_server.py b/rllib/examples/envs/external_envs/cartpole_server.py index 81a291e2d0e0..d6c661590387 100755 --- a/rllib/examples/envs/external_envs/cartpole_server.py +++ b/rllib/examples/envs/external_envs/cartpole_server.py @@ -174,7 +174,7 @@ def _input(ioctx): .offline_data(input_=_input) # Use n worker processes to listen on different ports. .env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, # Connectors are not compatible with the external env. enable_connectors=False, ) diff --git a/rllib/examples/envs/external_envs/unity3d_server.py b/rllib/examples/envs/external_envs/unity3d_server.py index 00129aea074b..e5b17ca1d16f 100755 --- a/rllib/examples/envs/external_envs/unity3d_server.py +++ b/rllib/examples/envs/external_envs/unity3d_server.py @@ -133,7 +133,7 @@ def _input(ioctx): .framework(args.framework) # Use n worker processes to listen on different ports. .env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, rollout_fragment_length=20, enable_connectors=False, ) diff --git a/rllib/examples/envs/greyscale_env.py b/rllib/examples/envs/greyscale_env.py index 480b0c77c41d..9278b53ec2a9 100644 --- a/rllib/examples/envs/greyscale_env.py +++ b/rllib/examples/envs/greyscale_env.py @@ -83,8 +83,8 @@ def env_creator(config): PPOConfig() .environment("pistonball", env_config={"local_ratio": 0.5}, clip_rewards=True) .env_runners( - num_rollout_workers=15 if not args.as_test else 2, - num_envs_per_worker=1, + num_env_runners=15 if not args.as_test else 2, + num_envs_per_env_runner=1, observation_filter="NoFilter", rollout_fragment_length="auto", ) diff --git a/rllib/examples/envs/unity3d_env_local.py b/rllib/examples/envs/unity3d_env_local.py index 46bc9f95af5b..bfe4a4a4a165 100644 --- a/rllib/examples/envs/unity3d_env_local.py +++ b/rllib/examples/envs/unity3d_env_local.py @@ -132,7 +132,7 @@ # For running in editor, force to use just one Worker (we only have # one Unity running)! .env_runners( - num_rollout_workers=args.num_workers if args.file_name else 0, + num_env_runners=args.num_workers if args.file_name else 0, rollout_fragment_length=200, ) .training( diff --git a/rllib/examples/gpus/fractional_gpus.py b/rllib/examples/gpus/fractional_gpus.py index 6b70fd621b34..ad87ba866e46 100644 --- a/rllib/examples/gpus/fractional_gpus.py +++ b/rllib/examples/gpus/fractional_gpus.py @@ -95,12 +95,12 @@ num_gpus_per_worker=args.num_gpus_per_worker, ) # How many RolloutWorkers (each with n environment copies: - # `num_envs_per_worker`)? + # `num_envs_per_env_runner`)? .env_runners( - num_rollout_workers=args.num_workers, + num_env_runners=args.num_workers, # This setting should not really matter as it does not affect the # number of GPUs reserved for each worker. - num_envs_per_worker=args.num_envs_per_worker, + num_envs_per_env_runner=args.num_envs_per_worker, ) # 4 tune trials altogether. 
.training(lr=tune.grid_search([0.005, 0.003, 0.001, 0.0001])) diff --git a/rllib/examples/hierarchical/hierarchical_training.py b/rllib/examples/hierarchical/hierarchical_training.py index c62ee4b73ce3..76f23907b652 100644 --- a/rllib/examples/hierarchical/hierarchical_training.py +++ b/rllib/examples/hierarchical/hierarchical_training.py @@ -86,7 +86,7 @@ param_space=( PPOConfig() .environment(WindyMazeEnv) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .framework(args.framework) ).to_dict(), ).fit() @@ -103,7 +103,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): PPOConfig() .environment(HierarchicalWindyMazeEnv) .framework(args.framework) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training(entropy_coeff=0.01) .multi_agent( policies={ diff --git a/rllib/examples/multi_agent/multi_agent_cartpole.py b/rllib/examples/multi_agent/multi_agent_cartpole.py index b26f9b6ecb1b..4bdf019f10b1 100644 --- a/rllib/examples/multi_agent/multi_agent_cartpole.py +++ b/rllib/examples/multi_agent/multi_agent_cartpole.py @@ -51,7 +51,7 @@ # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's # MultiAgentEnv API. - num_envs_per_worker=1 + num_envs_per_env_runner=1 if args.num_agents > 0 else 20, ) diff --git a/rllib/examples/multi_agent/multi_agent_pendulum.py b/rllib/examples/multi_agent/multi_agent_pendulum.py index 00e73bafd3c5..757bed5cb76e 100644 --- a/rllib/examples/multi_agent/multi_agent_pendulum.py +++ b/rllib/examples/multi_agent/multi_agent_pendulum.py @@ -47,7 +47,7 @@ get_trainable_cls(args.algo) .get_default_config() .environment("env" if args.num_agents > 0 else "Pendulum-v1") - .env_runners(num_rollout_workers=4) + .env_runners(num_env_runners=4) .training( train_batch_size_per_learner=512, mini_batch_size_per_learner=64, diff --git a/rllib/examples/multi_agent/self_play_with_open_spiel.py b/rllib/examples/multi_agent/self_play_with_open_spiel.py index 2c8cb85e0e54..c6cccbbb2c28 100644 --- a/rllib/examples/multi_agent/self_play_with_open_spiel.py +++ b/rllib/examples/multi_agent/self_play_with_open_spiel.py @@ -125,8 +125,8 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ) ) .env_runners( - num_rollout_workers=args.num_env_runners, - num_envs_per_worker=1 if args.enable_new_api_stack else 5, + num_env_runners=args.num_env_runners, + num_envs_per_env_runner=1 if args.enable_new_api_stack else 5, ) .resources( num_learner_workers=args.num_gpus, diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index 4d4f0803cf45..5ad0bef527d3 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -53,7 +53,7 @@ config = ( cql.CQLConfig() .framework(framework="torch") - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) .training( n_step=3, bc_iters=0, diff --git a/rllib/examples/ray_tune/custom_experiment.py b/rllib/examples/ray_tune/custom_experiment.py index 5e0e441eeac0..dbb393d290dc 100644 --- a/rllib/examples/ray_tune/custom_experiment.py +++ b/rllib/examples/ray_tune/custom_experiment.py @@ -93,7 +93,7 @@ def my_experiment(config: Dict): # Set the number of EnvRunners for collecting training data to 0 (local # worker only). - config.env_runners(num_rollout_workers=0) + config.env_runners(num_env_runners=0) eval_algo = config.build() # Load state from the low-lr algo into this one. 
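Many of the hunks above simply set num_env_runners=0. A small, self-contained sketch of what that buys (single-process sampling on the Algorithm's local worker, as in the hierarchical-training and custom-experiment scripts above), assuming CartPole-v1 and PPO as stand-ins:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")
    # 0 remote EnvRunners: rollouts are collected by the local worker,
    # so no extra Ray actors are spawned for sampling.
    .env_runners(num_env_runners=0, rollout_fragment_length=50)
)
algo = config.build()
result = algo.train()
algo.stop()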
diff --git a/rllib/examples/rl_modules/classes/mobilenet_rlm.py b/rllib/examples/rl_modules/classes/mobilenet_rlm.py index 1878ac75b675..f31ae4f1c6d4 100644 --- a/rllib/examples/rl_modules/classes/mobilenet_rlm.py +++ b/rllib/examples/rl_modules/classes/mobilenet_rlm.py @@ -73,7 +73,7 @@ def setup(self): ), }, ) - .env_runners(num_rollout_workers=0) + .env_runners(num_env_runners=0) # The following training settings make it so that a training iteration is very # quick. This is just for the sake of this example. PPO will not learn properly # with these settings! diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index dcce4afc042b..730314303263 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -28,7 +28,7 @@ ) .env_runners( num_envs_per_env_runner=5, - num_rollout_workers=1, + num_env_runners=1, observation_filter="MeanStdFilter", ) .resources(num_gpus=0) diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py index dc68afcddf1a..946a65ad1042 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-and-stalling-recreate-workers-appo.py @@ -37,7 +37,7 @@ ) .env_runners( num_env_runners=3, - num_envs_per_worker=1, + num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). .fault_tolerance( diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py index 970e002b3633..4ac5afd7ea7f 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-recreate-workers-appo.py @@ -31,7 +31,7 @@ }, ) .env_runners( - num_rollout_workers=4, + num_env_runners=4, num_envs_per_env_runner=1, ) # Switch on resiliency (recreate any failed worker). diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index 52f4c658f7f8..83e0b1efefec 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -303,7 +303,7 @@ def stop_all(self): .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - num_rollout_workers=1, + num_env_runners=1, env_to_module_connector=_make_env_to_module_connector, ) # TODO (simon): Adjust to new model_config_dict. diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 236c376b0414..022613f29570 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -296,7 +296,7 @@ def stop_all(self): .env_runners( # Every 4 agent steps a training update is performed. rollout_fragment_length=4, - num_rollout_workers=1, + num_env_runners=1, ) .resources( # We have a train/sample ratio of 1:1 and a batch of 32. 
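The DQN benchmark configs above pair rollout_fragment_length=4 with a single EnvRunner; the resulting per-sample sizes follow the arithmetic spelled out in the rollout_fragment_length docstring touched in the next patch. A tiny worked sketch (numbers are illustrative):

# Each EnvRunner returns roughly
#   rollout_fragment_length * num_envs_per_env_runner
# timesteps per sample() call; a full sampling round multiplies that by
# the number of (remote) EnvRunners.
rollout_fragment_length = 4
num_envs_per_env_runner = 1
num_env_runners = 1

per_runner = rollout_fragment_length * num_envs_per_env_runner  # -> 4
per_round = per_runner * num_env_runners                        # -> 4
print(per_runner, per_round)

# The docstring's own example: 100 steps/env * 5 envs = 500 steps per chunk.
print(100 * 5)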
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 88c23abb2b2c..9f7d33f44bd5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1572,7 +1572,7 @@ def check_reproducibilty( num_gpus: int(os.environ.get("RLLIB_NUM_GPUS", "0")) num_workers: 0 (only local workers) or 4 ((1) local workers + (4) remote workers) - num_envs_per_worker: 2 + num_envs_per_env_runner: 2 Args: algo_class: Algorithm class to test. @@ -1603,7 +1603,7 @@ def check_reproducibilty( # new API num_gpus_per_learner_worker=int(os.environ.get("RLLIB_NUM_GPUS", "0")), ) - .env_runners(num_rollout_workers=num_workers, num_envs_per_worker=2) + .env_runners(num_env_runners=num_workers, num_envs_per_env_runner=2) ) for fw in framework_iterator(algo_config, **fw_kwargs): From 215248a362e65172a4523bfa609d6d1b16dd3235 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 11:24:00 +0200 Subject: [PATCH 08/15] wip Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 3 ++- rllib/algorithms/algorithm_config.py | 8 ++++---- rllib/tests/backward_compat/test_backward_compat.py | 6 +++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 6ce237fa763d..275cc836fcb9 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -1428,7 +1428,8 @@ def _env_runner_remote(worker, num, round, iter): units_per_healthy_remote_worker = ( 1 if unit == "episodes" - else eval_cfg.rollout_fragment_length * eval_cfg.num_envs_per_env_runner + else eval_cfg.rollout_fragment_length + * eval_cfg.num_envs_per_env_runner ) # Select proper number of evaluation workers for this round. selected_eval_worker_ids = [ diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 886dd434fb99..000524b05d93 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1676,8 +1676,8 @@ def env_runners( 2. These fragments are concatenated and we perform an epoch of SGD. When using multiple envs per worker, the fragment size is multiplied by `num_envs_per_env_runner`. This is since we are collecting steps from - multiple envs in parallel. For example, if num_envs_per_env_runner=5, then - EnvRunners will return experiences in chunks of 5*100 = 500 steps. + multiple envs in parallel. For example, if num_envs_per_env_runner=5, + then EnvRunners will return experiences in chunks of 5*100 = 500 steps. The dataflow here can vary per algorithm. For example, PPO further divides the train batch into minibatches for multi-epoch SGD. Set `rollout_fragment_length` to "auto" to have RLlib compute an exact @@ -1699,8 +1699,8 @@ def env_runners( batch of at least `rollout_fragment_length * num_envs_per_env_runner` in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the (minimum) batch size. - Note that when `num_envs_per_env_runner > 1`, episode steps will be buffered - until the episode completes, and hence batches may contain + Note that when `num_envs_per_env_runner > 1`, episode steps will be + buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. explore: Default exploration behavior, iff `explore=None` is passed into compute_action(s). 
Set to False for no exploration behavior (e.g., diff --git a/rllib/tests/backward_compat/test_backward_compat.py b/rllib/tests/backward_compat/test_backward_compat.py index c58ea206bf0c..5386aaf925b2 100644 --- a/rllib/tests/backward_compat/test_backward_compat.py +++ b/rllib/tests/backward_compat/test_backward_compat.py @@ -92,6 +92,9 @@ def test_old_algorithm_config_dicts(self): "policies_to_train": ["pol1"], "policy_mapping_fn": lambda aid, episode, worker, **kwargs: "pol1", }, + # Test, whether both keys (that map to the same new key) still work. + "num_workers": 2, + "num_rollout_workers": 2, } config = AlgorithmConfig.from_dict(config_dict) self.assertFalse(config.in_evaluation) @@ -101,6 +104,7 @@ def test_old_algorithm_config_dicts(self): eval_config = config.get_evaluation_config_object() self.assertTrue(eval_config.in_evaluation) self.assertTrue(eval_config.lr == 0.1) + self.assertTrue(config.num_env_runners == 2) register_env( "test", @@ -114,7 +118,7 @@ def test_old_algorithm_config_dicts(self): }, "lr": 0.001, "evaluation_config": { - "num_envs_per_env_runner": 4, + "num_envs_per_worker": 4, # old key -> num_envs_per_env_runner "explore": False, }, "evaluation_num_env_runners": 1, From 6e46aaeccfc696920450d5ebaa872610ec8ca4a8 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 13:32:08 +0200 Subject: [PATCH 09/15] fixes and LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm_config.py | 1 + rllib/connectors/agent/state_buffer.py | 2 +- rllib/connectors/agent/view_requirement.py | 2 +- rllib/evaluation/env_runner_v2.py | 6 ++++-- rllib/tuned_examples/dqn/benchmark_dqn_atari.py | 4 ++-- .../dqn/benchmark_dqn_atari_rllib_preprocessing.py | 2 +- 6 files changed, 10 insertions(+), 7 deletions(-) diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 000524b05d93..964efa0875cf 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -634,6 +634,7 @@ def to_dict(self) -> AlgorithmConfigDict: "min_train_timesteps_per_reporting", "min_sample_timesteps_per_reporting", "input_evaluation", + "_enable_new_api_stack", ]: if config.get(dep_k) == DEPRECATED_VALUE: config.pop(dep_k, None) diff --git a/rllib/connectors/agent/state_buffer.py b/rllib/connectors/agent/state_buffer.py index aa3f8e94fb06..bb235db2ab8d 100644 --- a/rllib/connectors/agent/state_buffer.py +++ b/rllib/connectors/agent/state_buffer.py @@ -67,7 +67,7 @@ def on_policy_output(self, ac_data: ActionConnectorDataType): def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: d = ac_data.data assert ( - type(d) == dict + type(d) is dict ), "Single agent data must be of type Dict[str, TensorStructType]" env_id = ac_data.env_id diff --git a/rllib/connectors/agent/view_requirement.py b/rllib/connectors/agent/view_requirement.py index f95e3fec8d35..7bfe7270102c 100644 --- a/rllib/connectors/agent/view_requirement.py +++ b/rllib/connectors/agent/view_requirement.py @@ -68,7 +68,7 @@ def reset(self, env_id: str): def transform(self, ac_data: AgentConnectorDataType) -> AgentConnectorDataType: d = ac_data.data assert ( - type(d) == dict + type(d) is dict ), "Single agent data must be of type Dict[str, TensorStructType]" env_id = ac_data.env_id diff --git a/rllib/evaluation/env_runner_v2.py b/rllib/evaluation/env_runner_v2.py index 8b6a0910bacf..fc488f8e8ee2 100644 --- a/rllib/evaluation/env_runner_v2.py +++ b/rllib/evaluation/env_runner_v2.py @@ -1147,11 +1147,13 @@ def _process_policy_eval_results( input_dict: 
TensorStructType = eval_data[i].data.raw_dict rnn_states: List[StateBatches] = tree.map_structure( - lambda x: x[i], rnn_out + lambda x, i=i: x[i], rnn_out ) # extra_action_out could be a nested dict - fetches: Dict = tree.map_structure(lambda x: x[i], extra_action_out) + fetches: Dict = tree.map_structure( + lambda x, i=i: x[i], extra_action_out + ) # Post-process policy output by running them through action connectors. ac_data = ActionConnectorDataType( diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py index 83e0b1efefec..af5f352600bd 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari.py @@ -236,8 +236,8 @@ for env in benchmark_envs.keys(): tune.register_env( env, - lambda ctx: AtariPreprocessing( - gym.make(env, **ctx), grayscale_newaxis=True, screen_size=84, noop_max=0 + lambda ctx, e=env: AtariPreprocessing( + gym.make(e, **ctx), grayscale_newaxis=True, screen_size=84, noop_max=0 ), ) diff --git a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py index 022613f29570..9fab4f934362 100644 --- a/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py +++ b/rllib/tuned_examples/dqn/benchmark_dqn_atari_rllib_preprocessing.py @@ -236,7 +236,7 @@ env, # Use the RLlib atari wrapper to squeeze images to 84x84. # Note, the default of this wrapper is `framestack=4`. - lambda ctx: wrap_atari_for_new_api_stack(gym.make(env, **ctx), dim=84), + lambda ctx, e=env: wrap_atari_for_new_api_stack(gym.make(e, **ctx), dim=84), ) From f018bcb1554a66c3155ce149a9fbc04fd4ba008b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 14:02:46 +0200 Subject: [PATCH 10/15] fixes and LINT Signed-off-by: sven1977 --- rllib/algorithms/algorithm.py | 30 +++++++++------- rllib/algorithms/algorithm_config.py | 34 ++++++++++++++++--- rllib/algorithms/sac/sac.py | 2 +- .../evaluation_parallel_to_training.py | 2 +- 4 files changed, 49 insertions(+), 19 deletions(-) diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 275cc836fcb9..53d0a7b3ed4b 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -795,7 +795,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Sync the weights from the learner group to the rollout workers. weights = self.learner_group.get_weights( - inference_only=self.config.uses_new_env_runners + inference_only=self.config.enable_env_runner_and_connector_v2 ) local_worker.set_weights(weights) self.workers.sync_weights(inference_only=True) @@ -1053,7 +1053,7 @@ def evaluate( else: pass - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: # Lifetime eval counters. self.metrics.log_dict( { @@ -1171,7 +1171,7 @@ def _evaluate_on_local_env_runner(self, env_runner): env_runner_results = env_runner.get_metrics() - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( env_runner_results, env_runner_results, @@ -1299,7 +1299,7 @@ def _env_runner_remote(worker, num, round, iter): "recreate_failed_env_runners=True)` setting." ) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( all_metrics, all_metrics, @@ -1488,7 +1488,7 @@ def _env_runner_remote(worker, num, round, iter): "recreate_failed_env_runners=True)` setting." 
) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: env_runner_results = summarize_episodes( all_metrics, all_metrics, @@ -1623,7 +1623,9 @@ def training_step(self) -> ResultDict: worker_set=self.workers, max_agent_steps=self.config.train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) else: @@ -1631,7 +1633,9 @@ def training_step(self) -> ResultDict: worker_set=self.workers, max_env_steps=self.config.train_batch_size, sample_timeout_s=self.config.sample_timeout_s, - _uses_new_env_runners=self.config.uses_new_env_runners, + _uses_new_env_runners=( + self.config.enable_env_runner_and_connector_v2 + ), _return_metrics=True, ) train_batch = train_batch.as_multi_agent() @@ -2444,7 +2448,7 @@ def load_checkpoint(self, checkpoint_dir: str) -> None: self.learner_group.load_state(learner_state_dir) # Make also sure, all training EnvRunners get the just loaded weights. weights = self.learner_group.get_weights( - inference_only=self.config.uses_new_env_runners + inference_only=self.config.enable_env_runner_and_connector_v2 ) self.workers.local_worker().set_weights(weights) self.workers.sync_weights(inference_only=True) @@ -2867,7 +2871,7 @@ def __getstate__(self) -> Dict: state["local_replay_buffer"] = self.local_replay_buffer.get_state() # New API stack: Save entire MetricsLogger state. - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: state["metrics_logger"] = self.metrics.get_state() # Old API stack: Save only counters. else: @@ -3208,7 +3212,7 @@ def _run_one_evaluation( # To make the old stack forward compatible with the new API stack metrics # structure, we add everything under the new key (EVALUATION_RESULTS) as well as # the old one ("evaluation"). - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: self.evaluation_metrics["evaluation"] = eval_results return self.evaluation_metrics @@ -3315,7 +3319,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread( env_steps += batch.env_steps() all_metrics.append(metrics) - if not self.config.uses_new_env_runners: + if not self.config.enable_env_runner_and_connector_v2: eval_results = summarize_episodes( all_metrics, all_metrics, @@ -3600,7 +3604,7 @@ def __enter__(self): self.time_start = time.time() self.sampled = 0 self.trained = 0 - if self.algo.config.uses_new_env_runners: + if self.algo.config.enable_env_runner_and_connector_v2: self.init_env_steps_sampled = self.algo.metrics.peek( NUM_ENV_STEPS_SAMPLED_LIFETIME ) @@ -3646,7 +3650,7 @@ def should_stop(self, results): return False # Stopping criteria. - if self.algo.config.uses_new_env_runners: + if self.algo.config.enable_env_runner_and_connector_v2: if self.algo.config.count_steps_by == "agent_steps": self.sampled = ( sum( diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 964efa0875cf..58fde3ec5dc3 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -527,8 +527,6 @@ def __init__(self, algo_class: Optional[type] = None): # TODO: Remove, once all deprecation_warning calls upon using these keys # have been removed. 
# === Deprecated keys === - self._enable_new_api_stack = DEPRECATED_VALUE - self.evaluation_num_workers = DEPRECATED_VALUE self.simple_optimizer = DEPRECATED_VALUE self.monitor = DEPRECATED_VALUE self.evaluation_num_episodes = DEPRECATED_VALUE @@ -4461,10 +4459,24 @@ def rollouts(self, *args, **kwargs): def exploration(self, *args, **kwargs): return self.env_runners(*args, **kwargs) - @Deprecated(new="AlgorithmConfig.enable_env_runner_and_connector_v2", error=True) @property + @Deprecated(new="AlgorithmConfig._enable_new_api_stack", error=False) + def _enable_new_api_stack(self): + return self.enable_rl_module_and_learner + + @_enable_new_api_stack.setter + def _enable_new_api_stack(self, value): + deprecation_warning( + old="AlgorithmConfig._enable_new_api_stack", + new="AlgorithmConfig.enable_rl_module_and_learner", + error=False, + ) + self.enable_rl_module_and_learner = value + + @property + @Deprecated(new="AlgorithmConfig.enable_env_runner_and_connector_v2", error=True) def uses_new_env_runners(self): - return None + pass @property @Deprecated(new="AlgorithmConfig.num_env_runners", error=False) @@ -4480,6 +4492,20 @@ def num_rollout_workers(self, value): ) self.num_env_runners = value + @property + @Deprecated(new="AlgorithmConfig.evaluation_num_workers", error=False) + def evaluation_num_workers(self): + return self.evaluation_num_env_runners + + @evaluation_num_workers.setter + def evaluation_num_workers(self, value): + deprecation_warning( + old="AlgorithmConfig.evaluation_num_workers", + new="AlgorithmConfig.evaluation_num_env_runners", + error=False, + ) + self.evaluation_num_env_runners = value + @property @Deprecated(new="AlgorithmConfig.num_envs_per_env_runner", error=False) def num_envs_per_worker(self): diff --git a/rllib/algorithms/sac/sac.py b/rllib/algorithms/sac/sac.py index 0d8aa17e399a..c58170cc44e7 100644 --- a/rllib/algorithms/sac/sac.py +++ b/rllib/algorithms/sac/sac.py @@ -447,7 +447,7 @@ def training_step(self) -> ResultDict: The results dict from executing the training iteration. """ # New API stack (RLModule, Learner, EnvRunner, ConnectorV2). - if self.config.uses_new_env_runners: + if self.config.enable_env_runner_and_connector_v2: return self._training_step_new_api_stack(with_noise_reset=False) # Old and hybrid API stacks (Policy, RolloutWorker, Connector, maybe RLModule, # maybe Learner). diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py index d5d035282a03..d1e45bed5624 100644 --- a/rllib/examples/evaluation/evaluation_parallel_to_training.py +++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py @@ -138,7 +138,7 @@ def on_train_result(self, *, algorithm: Algorithm, result: ResultDict, **kwargs) # `evaluation_num_env_runners` or `evaluation_parallel_to_training`). 
if eval_env_runner_results and NUM_EPISODES in eval_env_runner_results: num_episodes_done = eval_env_runner_results[NUM_EPISODES] - if algorithm.config.uses_new_env_runners: + if algorithm.config.enable_env_runner_and_connector_v2: num_timesteps_reported = eval_env_runner_results[NUM_ENV_STEPS_SAMPLED] else: num_timesteps_reported = eval_results["timesteps_this_iter"] From d29eaa7bd61b3d307abaefac979a9c493611e899 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 14:37:59 +0200 Subject: [PATCH 11/15] wip Signed-off-by: sven1977 --- rllib/evaluation/worker_set.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index f627b5ace3b0..73f6a3f4ed50 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -26,8 +26,6 @@ from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.env_context import EnvContext from ray.rllib.env.env_runner import EnvRunner -from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner -from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner from ray.rllib.offline import get_dataset_and_shards from ray.rllib.policy.policy import Policy, PolicyState from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID @@ -144,8 +142,14 @@ def __init__( if self.env_runner_cls is None: if config.enable_env_runner_and_connector_v2: if config.is_multi_agent(): + from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner + self.env_runner_cls = MultiAgentEnvRunner else: + from ray.rllib.env.single_agent_env_runner import ( + SingleAgentEnvRunner, + ) + self.env_runner_cls = SingleAgentEnvRunner else: self.env_runner_cls = RolloutWorker From 17085396a1a8611b1e2fa5ed75c8d2308c25be40 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 15:30:27 +0200 Subject: [PATCH 12/15] wip Signed-off-by: sven1977 --- rllib/examples/multi_agent/different_spaces_for_agents.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/examples/multi_agent/different_spaces_for_agents.py b/rllib/examples/multi_agent/different_spaces_for_agents.py index ffbc45255380..ec543de185fc 100644 --- a/rllib/examples/multi_agent/different_spaces_for_agents.py +++ b/rllib/examples/multi_agent/different_spaces_for_agents.py @@ -33,8 +33,8 @@ class BasicMultiAgentMultiSpaces(MultiAgentEnv): """A simple multi-agent example environment where agents have different spaces. - agent0: obs=(10,), act=Discrete(2) - agent1: obs=(20,), act=Discrete(3) + agent0: obs=Box(10,), act=Discrete(2) + agent1: obs=Box(20,), act=Discrete(3) The logic of the env doesn't really matter for this example. 
The point of this env is to show how to use multi-agent envs, in which the different agents utilize From 963409ee837d8a10897fbd4d91aadc1b5774bd2b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 15:40:36 +0200 Subject: [PATCH 13/15] wip Signed-off-by: sven1977 --- rllib/utils/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 9f7d33f44bd5..a9aac8d012b5 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1389,7 +1389,7 @@ def run_rllib_example_script_experiment( ] print(f" R(eval)={Reval}", end="") print() - for key, value in stop.items(): + for key, threshold in stop.items(): val = results for k in key.split("/"): try: @@ -1397,8 +1397,8 @@ def run_rllib_example_script_experiment( except KeyError: val = None break - if val is not None and val >= value: - print(f"Stop criterium ({key}={value}) fulfilled!") + if val is not None and not np.isnan(val) and val >= threshold: + print(f"Stop criterium ({key}={threshold}) fulfilled!") return results ray.shutdown() return results From 264431dc23237a6c431f18fa263e03e4b231709b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 16:01:26 +0200 Subject: [PATCH 14/15] wip Signed-off-by: sven1977 --- rllib/env/multi_agent_env_runner.py | 78 +++++++++++++++++----------- rllib/env/single_agent_env_runner.py | 68 +++++++++++++----------- rllib/utils/metrics/stats.py | 3 +- 3 files changed, 90 insertions(+), 59 deletions(-) diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 547dddf18359..cf2facb1b0ae 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -3,6 +3,7 @@ from collections import defaultdict from functools import partial +import numpy as np from typing import DefaultDict, Dict, List, Optional from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -624,37 +625,18 @@ def get_metrics(self) -> ResultDict: module_episode_returns[sa_eps.module_id] += return_eps2 del self._ongoing_episodes_for_metrics[eps.id_] - # Log general episode metrics. - self.metrics.log_dict( - { - "episode_len_mean": episode_length, - "episode_return_mean": episode_return, - "episode_duration_sec_mean": episode_duration_s, - # Per-agent returns. - "agent_episode_returns_mean": agent_episode_returns, - # Per-RLModule returns. - "module_episode_returns_mean": module_episode_returns, - }, - # To mimick the old API stack behavior, we'll use `window` here for - # these particular stats (instead of the default EMA). - window=self.config.metrics_num_episodes_for_smoothing, - ) - # For some metrics, log min/max as well. - self.metrics.log_dict( - { - "episode_len_min": episode_length, - "episode_return_min": episode_return, - }, - reduce="min", - ) - self.metrics.log_dict( - { - "episode_len_max": episode_length, - "episode_return_max": episode_return, - }, - reduce="max", + self._log_episode_metrics( + episode_length, + episode_return, + episode_duration_s, + agent_episode_returns, + module_episode_returns, ) + # If no episodes at all, log NaN stats. + if len(self._done_episodes_for_metrics) == 0: + self._log_episode_metrics(np.nan, np.nan, np.nan) + # Log num episodes counter for this iteration. 
self.metrics.log_value( NUM_EPISODES, @@ -865,3 +847,41 @@ def _make_on_episode_callback(self, which: str, episode=None): rl_module=self.module, env_index=0, ) + + def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): + # Log general episode metrics. + self.metrics.log_dict( + { + "episode_len_mean": length, + "episode_return_mean": ret, + "episode_duration_sec_mean": sec, + **( + { + # Per-agent returns. + "agent_episode_returns_mean": agents, + # Per-RLModule returns. + "module_episode_returns_mean": modules, + } + if agents is not None + else {} + ), + }, + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). + window=self.config.metrics_num_episodes_for_smoothing, + ) + # For some metrics, log min/max as well. + self.metrics.log_dict( + { + "episode_len_min": length, + "episode_return_min": ret, + }, + reduce="min", + ) + self.metrics.log_dict( + { + "episode_len_max": length, + "episode_return_max": ret, + }, + reduce="max", + ) diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index a468de8990af..a3610c8e9162 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -4,6 +4,7 @@ from collections import defaultdict from functools import partial +import numpy as np from typing import DefaultDict, Dict, List, Optional from ray.rllib.algorithms.algorithm_config import AlgorithmConfig @@ -587,37 +588,14 @@ def get_metrics(self) -> ResultDict: episode_duration_s += eps2.get_duration_s() del self._ongoing_episodes_for_metrics[eps.id_] - # Log general episode metrics. - self.metrics.log_dict( - { - "episode_len_mean": episode_length, - "episode_return_mean": episode_return, - "episode_duration_sec_mean": episode_duration_s, - # Per-agent returns. - "agent_episode_returns_mean": {DEFAULT_AGENT_ID: episode_return}, - # Per-RLModule returns. - "module_episode_returns_mean": {DEFAULT_MODULE_ID: episode_return}, - }, - # To mimick the old API stack behavior, we'll use `window` here for - # these particular stats (instead of the default EMA). - window=self.config.metrics_num_episodes_for_smoothing, - ) - # For some metrics, log min/max as well. - self.metrics.log_dict( - { - "episode_len_min": episode_length, - "episode_return_min": episode_return, - }, - reduce="min", - ) - self.metrics.log_dict( - { - "episode_len_max": episode_length, - "episode_return_max": episode_return, - }, - reduce="max", + self._log_episode_metrics( + episode_length, episode_return, episode_duration_s ) + # If no episodes at all, log NaN stats. + if len(self._done_episodes_for_metrics) == 0: + self._log_episode_metrics(np.nan, np.nan, np.nan) + # Log num episodes counter for this iteration. self.metrics.log_value( NUM_EPISODES, @@ -776,3 +754,35 @@ def _convert_to_tensor(self, struct) -> TensorType: return convert_to_torch_tensor(struct) else: return tree.map_structure(tf.convert_to_tensor, struct) + + def _log_episode_metrics(self, length, ret, sec): + # Log general episode metrics. + self.metrics.log_dict( + { + "episode_len_mean": length, + "episode_return_mean": ret, + "episode_duration_sec_mean": sec, + # Per-agent returns. + "agent_episode_returns_mean": {DEFAULT_AGENT_ID: ret}, + # Per-RLModule returns. + "module_episode_returns_mean": {DEFAULT_MODULE_ID: ret}, + }, + # To mimick the old API stack behavior, we'll use `window` here for + # these particular stats (instead of the default EMA). 
+ window=self.config.metrics_num_episodes_for_smoothing, + ) + # For some metrics, log min/max as well. + self.metrics.log_dict( + { + "episode_len_min": length, + "episode_return_min": ret, + }, + reduce="min", + ) + self.metrics.log_dict( + { + "episode_len_max": length, + "episode_return_max": ret, + }, + reduce="max", + ) diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index 87dbc5571813..422c36c0c4cf 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -413,7 +413,8 @@ def _reduced_values(self) -> Tuple[Any, Any]: return mean_value, [mean_value] # Do non-EMA reduction (possibly using a window). else: - reduce_meth = getattr(np, self._reduce_method) + # Use the numpy "nan"-prefix to ignore NaN's in our value lists. + reduce_meth = getattr(np, "nan" + self._reduce_method) values = ( self.values if self._window is None else self.values[-self._window :] ) From 118e6f4fd5db329a68160c0b6f7f8e29f578d79f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 26 Apr 2024 17:04:40 +0200 Subject: [PATCH 15/15] wip Signed-off-by: sven1977 --- rllib/algorithms/dreamerv3/dreamerv3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rllib/algorithms/dreamerv3/dreamerv3.py b/rllib/algorithms/dreamerv3/dreamerv3.py index 373bbc9b654e..1111813b645f 100644 --- a/rllib/algorithms/dreamerv3/dreamerv3.py +++ b/rllib/algorithms/dreamerv3/dreamerv3.py @@ -150,6 +150,7 @@ def __init__(self, algo_class=None): self.remote_worker_envs = True # Dreamer only runs on the new API stack. self.enable_rl_module_and_learner = True + self.enable_env_runner_and_connector_v2 = True # __sphinx_doc_end__ # fmt: on
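The stats.py change above switches the reducers to their "nan"-prefixed numpy counterparts so that the NaN placeholders now logged by the EnvRunners (for iterations without a single finished episode) do not poison the smoothing window. A standalone numpy illustration, not RLlib code:

import numpy as np

values = [np.nan, 100.0, 200.0]  # one iteration finished no episode

print(np.mean(values))     # -> nan: the placeholder propagates
print(np.nanmean(values))  # -> 150.0: NaN entries are ignored
print(np.nanmax(values))   # -> 200.0

With the plain reducers, a single episode-free iteration would make episode_return_mean report NaN for as long as the placeholder stays inside the window; the nan-aware reducers keep the reported metrics meaningful while still recording that nothing finished.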